github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/tmpfs/regular_file.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tmpfs
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  	"math"
    21  	"sync/atomic"
    22  
    23  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    24  	"github.com/SagerNet/gvisor/pkg/context"
    25  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    26  	"github.com/SagerNet/gvisor/pkg/hostarch"
    27  	"github.com/SagerNet/gvisor/pkg/safemem"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    29  	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/usage"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    36  	"github.com/SagerNet/gvisor/pkg/sync"
    37  	"github.com/SagerNet/gvisor/pkg/syserror"
    38  	"github.com/SagerNet/gvisor/pkg/usermem"
    39  )
    40  
// regularFile is a regular (=S_IFREG) tmpfs file.
//
// +stateify savable
type regularFile struct {
	inode inode

	// memFile is a platform.File used to allocate pages to this regularFile.
	memFile *pgalloc.MemoryFile `state:"nosave"`

	// memoryUsageKind is the memory accounting category under which pages backing
	// this regularFile's contents are accounted. This is usage.Tmpfs for
	// ordinary tmpfs files and usage.Anonymous for files created by
	// NewZeroFile (MAP_SHARED|MAP_ANONYMOUS backing).
	memoryUsageKind usage.MemoryKind

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the file into memmap.MappingSpaces.
	//
	// Protected by mapsMu.
	mappings memmap.MappingSet

	// writableMappingPages tracks how many pages of virtual memory are mapped
	// as potentially writable from this file. If a page has multiple mappings,
	// each mapping is counted separately. Used to refuse F_SEAL_WRITE while
	// writable mappings exist (see AddSeals).
	//
	// This counter is susceptible to overflow as we can potentially count
	// mappings from many VMAs. We count pages rather than bytes to slightly
	// mitigate this.
	//
	// Protected by mapsMu.
	writableMappingPages uint64

	// dataMu protects the fields below.
	dataMu sync.RWMutex `state:"nosave"`

	// data maps offsets into the file to offsets into memFile that store
	// the file's data. Unpopulated ranges ("holes") read as zeroes.
	//
	// Protected by dataMu.
	data fsutil.FileRangeSet

	// seals represents file seals (linux.F_SEAL_*) on this inode.
	//
	// Protected by dataMu.
	seals uint32

	// size is the size of data.
	//
	// Protected by both dataMu and inode.mu; reading it requires holding
	// either mutex, while writing requires holding both AND using atomics.
	// Readers that do not require consistency (like Stat) may read the
	// value atomically without holding either lock.
	size uint64
}
    95  
    96  func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
    97  	file := &regularFile{
    98  		memFile:         fs.mfp.MemoryFile(),
    99  		memoryUsageKind: usage.Tmpfs,
   100  		seals:           linux.F_SEAL_SEAL,
   101  	}
   102  	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
   103  	file.inode.nlink = 1 // from parent directory
   104  	return &file.inode
   105  }
   106  
   107  // newUnlinkedRegularFileDescription creates a regular file on the tmpfs
   108  // filesystem represented by mount and returns an FD representing that file.
   109  // The new file is not reachable by path traversal from any other file.
   110  //
   111  // newUnlinkedRegularFileDescription is analogous to Linux's
   112  // mm/shmem.c:__shmem_file_setup().
   113  //
   114  // Preconditions: mount must be a tmpfs mount.
   115  func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
   116  	fs, ok := mount.Filesystem().Impl().(*filesystem)
   117  	if !ok {
   118  		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
   119  	}
   120  
   121  	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
   122  	d := fs.newDentry(inode)
   123  	defer d.DecRef(ctx)
   124  	d.name = name
   125  
   126  	fd := &regularFileFD{}
   127  	fd.Init(&inode.locks)
   128  	flags := uint32(linux.O_RDWR)
   129  	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
   130  		return nil, err
   131  	}
   132  	return fd, nil
   133  }
   134  
   135  // NewZeroFile creates a new regular file and file description as for
   136  // mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
   137  // initially (implicitly) filled with zeroes.
   138  //
   139  // Preconditions: mount must be a tmpfs mount.
   140  func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
   141  	// Compare mm/shmem.c:shmem_zero_setup().
   142  	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
   143  	if err != nil {
   144  		return nil, err
   145  	}
   146  	rf := fd.inode().impl.(*regularFile)
   147  	rf.memoryUsageKind = usage.Anonymous
   148  	rf.size = size
   149  	return &fd.vfsfd, err
   150  }
   151  
   152  // NewMemfd creates a new regular file and file description as for
   153  // memfd_create.
   154  //
   155  // Preconditions: mount must be a tmpfs mount.
   156  func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
   157  	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
   158  	if err != nil {
   159  		return nil, err
   160  	}
   161  	if allowSeals {
   162  		fd.inode().impl.(*regularFile).seals = 0
   163  	}
   164  	return &fd.vfsfd, nil
   165  }
   166  
   167  // truncate grows or shrinks the file to the given size. It returns true if the
   168  // file size was updated.
   169  func (rf *regularFile) truncate(newSize uint64) (bool, error) {
   170  	rf.inode.mu.Lock()
   171  	defer rf.inode.mu.Unlock()
   172  	return rf.truncateLocked(newSize)
   173  }
   174  
// truncateLocked grows or shrinks the file to newSize, returning true if the
// size changed. Growth is refused if F_SEAL_GROW is set; shrinking is refused
// if F_SEAL_SHRINK is set. When shrinking, mappings of the truncated range
// are invalidated and the backing pages are freed.
//
// Preconditions: rf.inode.mu must be held.
func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
	oldSize := rf.size
	if newSize == oldSize {
		// Nothing to do.
		return false, nil
	}

	// Need to hold inode.mu and dataMu while modifying size.
	rf.dataMu.Lock()
	if newSize > oldSize {
		// Can we grow the file?
		if rf.seals&linux.F_SEAL_GROW != 0 {
			rf.dataMu.Unlock()
			return false, linuxerr.EPERM
		}
		// We only need to update the file size; the newly exposed range is a
		// hole in rf.data and reads as zeroes.
		atomic.StoreUint64(&rf.size, newSize)
		rf.dataMu.Unlock()
		return true, nil
	}

	// We are shrinking the file. First check if this is allowed.
	if rf.seals&linux.F_SEAL_SHRINK != 0 {
		rf.dataMu.Unlock()
		return false, linuxerr.EPERM
	}

	// Update the file size before invalidating mappings so that concurrent
	// Translate calls (which bound translations by size) cannot re-translate
	// the truncated range.
	atomic.StoreUint64(&rf.size, newSize)
	rf.dataMu.Unlock()

	// Invalidate past translations of truncated pages. dataMu must be
	// released first: mapsMu precedes dataMu in the lock order.
	oldpgend := fs.OffsetPageEnd(int64(oldSize))
	newpgend := fs.OffsetPageEnd(int64(newSize))
	if newpgend < oldpgend {
		rf.mapsMu.Lock()
		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/shmem.c:shmem_setattr() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		rf.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them.
	rf.dataMu.Lock()
	rf.data.Truncate(newSize, rf.memFile)
	rf.dataMu.Unlock()
	return true, nil
}
   227  
   228  // AddMapping implements memmap.Mappable.AddMapping.
   229  func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
   230  	rf.mapsMu.Lock()
   231  	defer rf.mapsMu.Unlock()
   232  	rf.dataMu.RLock()
   233  	defer rf.dataMu.RUnlock()
   234  
   235  	// Reject writable mapping if F_SEAL_WRITE is set.
   236  	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
   237  		return linuxerr.EPERM
   238  	}
   239  
   240  	rf.mappings.AddMapping(ms, ar, offset, writable)
   241  	if writable {
   242  		pagesBefore := rf.writableMappingPages
   243  
   244  		// ar is guaranteed to be page aligned per memmap.Mappable.
   245  		rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)
   246  
   247  		if rf.writableMappingPages < pagesBefore {
   248  			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
   249  		}
   250  	}
   251  
   252  	return nil
   253  }
   254  
   255  // RemoveMapping implements memmap.Mappable.RemoveMapping.
   256  func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
   257  	rf.mapsMu.Lock()
   258  	defer rf.mapsMu.Unlock()
   259  
   260  	rf.mappings.RemoveMapping(ms, ar, offset, writable)
   261  
   262  	if writable {
   263  		pagesBefore := rf.writableMappingPages
   264  
   265  		// ar is guaranteed to be page aligned per memmap.Mappable.
   266  		rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)
   267  
   268  		if rf.writableMappingPages > pagesBefore {
   269  			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
   270  		}
   271  	}
   272  }
   273  
// CopyMapping implements memmap.Mappable.CopyMapping. Copying a mapping is
// equivalent to adding a new mapping at the destination range; the source
// range requires no additional bookkeeping here.
func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
}
   278  
// Translate implements memmap.Mappable.Translate. It allocates backing pages
// for any holes in the required range and returns translations into memFile.
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	// Constrain translations to f.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(int64(rf.size))
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			// The entire required range is beyond EOF.
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	// Allocate pages for any holes in [required, optional]. Newly-allocated
	// pages are already zeroed, so the fill callback is a no-op.
	cerr := rf.data.Fill(ctx, required, optional, rf.size, rf.memFile, rf.memoryUsageKind, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
		// Newly-allocated pages are zeroed, so we don't need to do anything.
		return dsts.NumBytes(), nil
	})

	// Collect translations for whatever ranges are now backed.
	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   rf.memFile,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  hostarch.AnyAccess,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by f.data.Fill if it occurred outside of
	// required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}
   327  
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
// No-op: this mappable has no unsavable state that needs invalidation.
func (*regularFile) InvalidateUnsavable(context.Context) error {
	return nil
}
   332  
// regularFileFD implements vfs.FileDescriptionImpl for regular tmpfs files.
//
// +stateify savable
type regularFileFD struct {
	fileDescription

	// off is the file offset. off is accessed using atomic memory operations.
	// offMu serializes operations that may mutate off.
	off   int64
	offMu sync.Mutex `state:"nosave"`
}
   342  
// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release(context.Context) {
	// noop: the file's backing pages are owned by the inode (regularFile),
	// not by any individual file description.
}
   347  
   348  // Allocate implements vfs.FileDescriptionImpl.Allocate.
   349  func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
   350  	f := fd.inode().impl.(*regularFile)
   351  
   352  	f.inode.mu.Lock()
   353  	defer f.inode.mu.Unlock()
   354  	oldSize := f.size
   355  	size := offset + length
   356  	if oldSize >= size {
   357  		return nil
   358  	}
   359  	_, err := f.truncateLocked(size)
   360  	return err
   361  }
   362  
   363  // PRead implements vfs.FileDescriptionImpl.PRead.
   364  func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
   365  	start := fsmetric.StartReadWait()
   366  	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
   367  	fsmetric.TmpfsReads.Increment()
   368  
   369  	if offset < 0 {
   370  		return 0, linuxerr.EINVAL
   371  	}
   372  
   373  	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
   374  	// all state is in-memory.
   375  	//
   376  	// TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags.
   377  	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
   378  		return 0, syserror.EOPNOTSUPP
   379  	}
   380  
   381  	if dst.NumBytes() == 0 {
   382  		return 0, nil
   383  	}
   384  	f := fd.inode().impl.(*regularFile)
   385  	rw := getRegularFileReadWriter(f, offset)
   386  	n, err := dst.CopyOutFrom(ctx, rw)
   387  	putRegularFileReadWriter(rw)
   388  	fd.inode().touchAtime(fd.vfsfd.Mount())
   389  	return n, err
   390  }
   391  
   392  // Read implements vfs.FileDescriptionImpl.Read.
   393  func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
   394  	fd.offMu.Lock()
   395  	n, err := fd.PRead(ctx, dst, fd.off, opts)
   396  	fd.off += n
   397  	fd.offMu.Unlock()
   398  	return n, err
   399  }
   400  
   401  // PWrite implements vfs.FileDescriptionImpl.PWrite.
   402  func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
   403  	n, _, err := fd.pwrite(ctx, src, offset, opts)
   404  	return n, err
   405  }
   406  
// pwrite returns the number of bytes written, final offset and error. The
// final offset should be ignored by PWrite.
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
	if offset < 0 {
		return 0, offset, linuxerr.EINVAL
	}

	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
	// all state is in-memory.
	//
	// TODO(github.com/SagerNet/issue/2601): Support select preadv2 flags.
	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
		return 0, offset, syserror.EOPNOTSUPP
	}

	srclen := src.NumBytes()
	if srclen == 0 {
		return 0, offset, nil
	}
	f := fd.inode().impl.(*regularFile)
	// inode.mu is held for the whole write so that O_APPEND offset selection,
	// the copy itself, and the timestamp update are atomic with respect to
	// concurrent writers and truncates.
	f.inode.mu.Lock()
	defer f.inode.mu.Unlock()
	// If the file is opened with O_APPEND, update offset to file size.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
		// Locking f.inode.mu is sufficient for reading f.size.
		offset = int64(f.size)
	}
	if end := offset + srclen; end < offset {
		// Overflow.
		return 0, offset, linuxerr.EINVAL
	}

	// vfs.CheckLimit may shorten srclen to fit the FD's file size limit, or
	// reject the write outright.
	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
	if err != nil {
		return 0, offset, err
	}
	src = src.TakeFirst64(srclen)

	rw := getRegularFileReadWriter(f, offset)
	n, err := src.CopyInTo(ctx, rw)
	f.inode.touchCMtimeLocked()
	// Writing clears the setuid/setgid bits. inode.mode is read/written
	// atomically elsewhere, so loop on compare-and-swap until our update
	// lands without clobbering a concurrent mode change.
	for {
		old := atomic.LoadUint32(&f.inode.mode)
		new := vfs.ClearSUIDAndSGID(old)
		if swapped := atomic.CompareAndSwapUint32(&f.inode.mode, old, new); swapped {
			break
		}
	}
	putRegularFileReadWriter(rw)
	return n, n + offset, err
}
   458  
   459  // Write implements vfs.FileDescriptionImpl.Write.
   460  func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
   461  	fd.offMu.Lock()
   462  	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
   463  	fd.off = off
   464  	fd.offMu.Unlock()
   465  	return n, err
   466  }
   467  
   468  // Seek implements vfs.FileDescriptionImpl.Seek.
   469  func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   470  	fd.offMu.Lock()
   471  	defer fd.offMu.Unlock()
   472  	switch whence {
   473  	case linux.SEEK_SET:
   474  		// use offset as specified
   475  	case linux.SEEK_CUR:
   476  		offset += fd.off
   477  	case linux.SEEK_END:
   478  		offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
   479  	default:
   480  		return 0, linuxerr.EINVAL
   481  	}
   482  	if offset < 0 {
   483  		return 0, linuxerr.EINVAL
   484  	}
   485  	fd.off = offset
   486  	return offset, nil
   487  }
   488  
   489  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
   490  func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
   491  	file := fd.inode().impl.(*regularFile)
   492  	opts.SentryOwnedContent = true
   493  	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
   494  }
   495  
// regularFileReadWriter implements safemem.Reader and safemem.Writer over a
// regularFile's contents.
type regularFileReadWriter struct {
	// file is the file being read from / written to.
	file *regularFile

	// Offset into the file to read/write at. Note that this may be
	// different from the FD offset if PRead/PWrite is used.
	off uint64
}
   504  
// regularFileReadWriterPool recycles regularFileReadWriter instances to avoid
// allocating a fresh one for every read/write call.
var regularFileReadWriterPool = sync.Pool{
	New: func() interface{} {
		return &regularFileReadWriter{}
	},
}
   510  
   511  func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
   512  	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
   513  	rw.file = file
   514  	rw.off = uint64(offset)
   515  	return rw
   516  }
   517  
   518  func putRegularFileReadWriter(rw *regularFileReadWriter) {
   519  	rw.file = nil
   520  	regularFileReadWriterPool.Put(rw)
   521  }
   522  
// ReadToBlocks implements safemem.Reader.ReadToBlocks. It copies file data
// into dsts starting at rw.off, zero-filling holes, and stops at EOF or when
// dsts is exhausted. Returns the number of bytes read; io.EOF if rw.off is at
// or past the file size.
func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	rw.file.dataMu.RLock()
	defer rw.file.dataMu.RUnlock()
	size := rw.file.size

	// Compute the range to read (limited by file size and overflow-checked).
	if rw.off >= size {
		return 0, io.EOF
	}
	end := size
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	var done uint64
	// Walk rw.file.data alternating between backed segments and gaps (holes).
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Tmpfs holes are zero-filled.
			gapmr := gap.Range().Intersect(mr)
			dst := dsts.TakeFirst64(gapmr.Length())
			n, err := safemem.ZeroSeq(dst)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
	return done, nil
}
   580  
// WriteFromBlocks implements safemem.Writer.WriteFromBlocks. It copies srcs
// into the file starting at rw.off, allocating backing pages for any holes it
// writes into, enforcing F_SEAL_WRITE/F_SEAL_GROW, and growing the file size
// if the write extends past the previous EOF.
//
// Preconditions: rw.file.inode.mu must be held.
func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	// Hold dataMu so we can modify size.
	rw.file.dataMu.Lock()
	defer rw.file.dataMu.Unlock()

	// Compute the range to write (overflow-checked).
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	// Check if seals prevent either file growth or all writes.
	switch {
	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
		return 0, linuxerr.EPERM
	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
		// When growth is sealed, Linux effectively allows writes which would
		// normally grow the file to partially succeed up to the current EOF,
		// rounded down to the page boundary before the EOF.
		//
		// This happens because writes (and thus the growth check) for tmpfs
		// files proceed page-by-page on Linux, and the final write to the page
		// containing EOF fails, resulting in a partial write up to the start of
		// that page.
		//
		// To emulate this behaviour, artificially truncate the write to the
		// start of the page containing the current EOF.
		//
		// See Linux, mm/filemap.c:generic_perform_write() and
		// mm/shmem.c:shmem_write_begin().
		if pgstart := uint64(hostarch.Addr(rw.file.size).RoundDown()); end > pgstart {
			end = pgstart
		}
		if end <= rw.off {
			// Truncation would result in no data being written.
			return 0, linuxerr.EPERM
		}
	}

	// Page-aligned mr for when we need to allocate memory. RoundUp can't
	// overflow since end is an int64.
	pgstartaddr := hostarch.Addr(rw.off).RoundDown()
	pgendaddr, _ := hostarch.Addr(end).RoundUp()
	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}

	var (
		done   uint64
		retErr error
	)
	// Walk rw.file.data alternating between backed segments and gaps. goto is
	// used to break out of the loop from inside the switch on error.
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Write)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.off += uint64(n)
			srcs = srcs.DropFirst64(n)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Allocate memory for the write.
			gapMR := gap.Range().Intersect(pgMR)
			fr, err := rw.file.memFile.Allocate(gapMR.Length(), rw.file.memoryUsageKind)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Write to that memory as usual.
			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}

		default:
			// Find/NextNonEmpty always yield either a segment or a gap.
			panic("unreachable")
		}
	}
exitLoop:
	// If the write ends beyond the file's previous size, it causes the
	// file to grow.
	if rw.off > rw.file.size {
		atomic.StoreUint64(&rw.file.size, rw.off)
	}

	return done, retErr
}
   683  
   684  // GetSeals returns the current set of seals on a memfd inode.
   685  func GetSeals(fd *vfs.FileDescription) (uint32, error) {
   686  	f, ok := fd.Impl().(*regularFileFD)
   687  	if !ok {
   688  		return 0, linuxerr.EINVAL
   689  	}
   690  	rf := f.inode().impl.(*regularFile)
   691  	rf.dataMu.RLock()
   692  	defer rf.dataMu.RUnlock()
   693  	return rf.seals, nil
   694  }
   695  
   696  // AddSeals adds new file seals to a memfd inode.
   697  func AddSeals(fd *vfs.FileDescription, val uint32) error {
   698  	f, ok := fd.Impl().(*regularFileFD)
   699  	if !ok {
   700  		return linuxerr.EINVAL
   701  	}
   702  	rf := f.inode().impl.(*regularFile)
   703  	rf.mapsMu.Lock()
   704  	defer rf.mapsMu.Unlock()
   705  	rf.dataMu.RLock()
   706  	defer rf.dataMu.RUnlock()
   707  
   708  	if rf.seals&linux.F_SEAL_SEAL != 0 {
   709  		// Seal applied which prevents addition of any new seals.
   710  		return linuxerr.EPERM
   711  	}
   712  
   713  	// F_SEAL_WRITE can only be added if there are no active writable maps.
   714  	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
   715  		if rf.writableMappingPages > 0 {
   716  			return linuxerr.EBUSY
   717  		}
   718  	}
   719  
   720  	// Seals can only be added, never removed.
   721  	rf.seals |= val
   722  	return nil
   723  }