github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/tmpfs/inode_file.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"fmt"
	"io"
	"math"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/safemem"
	"github.com/SagerNet/gvisor/pkg/sentry/fs"
	"github.com/SagerNet/gvisor/pkg/sentry/fs/fsutil"
	"github.com/SagerNet/gvisor/pkg/sentry/fsmetric"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
	ktime "github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
	"github.com/SagerNet/gvisor/pkg/sentry/usage"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/usermem"
)

// fileInodeOperations implements fs.InodeOperations for a regular tmpfs file.
// These files are backed by pages allocated from the kernel's MemoryFile, and
// may be directly mapped.
//
// Lock order: attrMu -> mapsMu -> dataMu.
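//
// For example, Truncate holds attrMu for the whole call, briefly takes dataMu
// to update attr.Size, takes mapsMu to invalidate mappings of truncated
// pages, and finally takes dataMu again to free them.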
//
// +stateify savable
type fileInodeOperations struct {
	fsutil.InodeGenericChecker `state:"nosave"`
	fsutil.InodeNoopWriteOut   `state:"nosave"`
	fsutil.InodeNotDirectory   `state:"nosave"`
	fsutil.InodeNotSocket      `state:"nosave"`
	fsutil.InodeNotSymlink     `state:"nosave"`

	fsutil.InodeSimpleExtendedAttributes

	// kernel is used to allocate memory that stores the file's contents.
	kernel *kernel.Kernel

	// memUsage is the memory usage kind to which this file's pages are
	// charged.
	memUsage usage.MemoryKind

	attrMu sync.Mutex `state:"nosave"`

	// attr contains the unstable metadata for the file.
	//
	// attr is protected by attrMu. attr.Size is protected by both attrMu
	// and dataMu; reading it requires locking either mutex, while mutating
	// it requires locking both.
	attr fs.UnstableAttr

	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the file into memmap.MappingSpaces.
	//
	// mappings is protected by mapsMu.
	mappings memmap.MappingSet

	// writableMappingPages tracks how many pages of virtual memory are mapped
	// as potentially writable from this file. If a page has multiple mappings,
	// each mapping is counted separately.
	//
	// This counter is susceptible to overflow as we can potentially count
	// mappings from many VMAs. We count pages rather than bytes to slightly
	// mitigate this.
	//
	// Protected by mapsMu.
	writableMappingPages uint64

	dataMu sync.RWMutex `state:"nosave"`

	// data maps offsets into the file to offsets into the kernel's MemoryFile
	// that store the file's data.
	//
	// data is protected by dataMu.
	data fsutil.FileRangeSet

	// seals represents file seals on this inode.
	//
	// Protected by dataMu.
	seals uint32
}

var _ fs.InodeOperations = (*fileInodeOperations)(nil)

// NewInMemoryFile returns a new file backed by Kernel.MemoryFile().
func NewInMemoryFile(ctx context.Context, usage usage.MemoryKind, uattr fs.UnstableAttr) fs.InodeOperations {
	return &fileInodeOperations{
		attr:     uattr,
		kernel:   kernel.KernelFromContext(ctx),
		memUsage: usage,
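		// Regular tmpfs files are not sealable; F_SEAL_SEAL prevents any
		// further seals from being added.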
		seals:    linux.F_SEAL_SEAL,
	}
}

// NewMemfdInode creates a new inode backing a memfd. Memory used by the memfd
// is backed by the kernel's MemoryFile.
func NewMemfdInode(ctx context.Context, allowSeals bool) *fs.Inode {
	// Per Linux, mm/shmem.c:__shmem_file_setup(), memfd inodes are set up with
	// S_IRWXUGO.
	perms := fs.PermMask{Read: true, Write: true, Execute: true}
	iops := NewInMemoryFile(ctx, usage.Tmpfs, fs.UnstableAttr{
		Owner: fs.FileOwnerFromContext(ctx),
		Perms: fs.FilePermissions{User: perms, Group: perms, Other: perms}}).(*fileInodeOperations)
	if allowSeals {
		iops.seals = 0
	}
	return fs.NewInode(ctx, iops, fs.NewNonCachingMountSource(ctx, nil, fs.MountSourceFlags{}), fs.StableAttr{
		Type:      fs.RegularFile,
		DeviceID:  tmpfsDevice.DeviceID(),
		InodeID:   tmpfsDevice.NextIno(),
		BlockSize: hostarch.PageSize,
	})
}
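
// Example (hypothetical sketch): a memfd_create(MFD_ALLOW_SEALING)
// implementation would create a sealable inode and could later restrict it:
//
//	inode := NewMemfdInode(ctx, true /* allowSeals */)
//	err := AddSeals(inode, linux.F_SEAL_GROW|linux.F_SEAL_SHRINK)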

// Release implements fs.InodeOperations.Release.
func (f *fileInodeOperations) Release(context.Context) {
	f.dataMu.Lock()
	defer f.dataMu.Unlock()
	f.data.DropAll(f.kernel.MemoryFile())
}

// Mappable implements fs.InodeOperations.Mappable.
func (f *fileInodeOperations) Mappable(*fs.Inode) memmap.Mappable {
	return f
}

// Rename implements fs.InodeOperations.Rename.
func (*fileInodeOperations) Rename(ctx context.Context, inode *fs.Inode, oldParent *fs.Inode, oldName string, newParent *fs.Inode, newName string, replacement bool) error {
	return rename(ctx, oldParent, oldName, newParent, newName, replacement)
}

// GetFile implements fs.InodeOperations.GetFile.
func (f *fileInodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	if fs.IsSocket(d.Inode.StableAttr) {
		return nil, linuxerr.ENXIO
	}

	if flags.Write {
		fsmetric.TmpfsOpensW.Increment()
	} else if flags.Read {
		fsmetric.TmpfsOpensRO.Increment()
	}
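
	// Tmpfs files are always seekable, so positional reads and writes are
	// supported regardless of how the file was opened.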
	flags.Pread = true
	flags.Pwrite = true
	return fs.NewFile(ctx, d, flags, &regularFileOperations{iops: f}), nil
}

// UnstableAttr returns unstable attributes of this tmpfs file.
func (f *fileInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	f.attrMu.Lock()
	f.dataMu.RLock()
	attr := f.attr
	attr.Usage = int64(f.data.Span())
	f.dataMu.RUnlock()
	f.attrMu.Unlock()
	return attr, nil
}

// Check implements fs.InodeOperations.Check.
func (f *fileInodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
	return fs.ContextCanAccessFile(ctx, inode, p)
}

// SetPermissions implements fs.InodeOperations.SetPermissions.
func (f *fileInodeOperations) SetPermissions(ctx context.Context, _ *fs.Inode, p fs.FilePermissions) bool {
	f.attrMu.Lock()
	f.attr.SetPermissions(ctx, p)
	f.attrMu.Unlock()
	return true
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps.
func (f *fileInodeOperations) SetTimestamps(ctx context.Context, _ *fs.Inode, ts fs.TimeSpec) error {
	f.attrMu.Lock()
	f.attr.SetTimestamps(ctx, ts)
	f.attrMu.Unlock()
	return nil
}

// SetOwner implements fs.InodeOperations.SetOwner.
func (f *fileInodeOperations) SetOwner(ctx context.Context, _ *fs.Inode, owner fs.FileOwner) error {
	f.attrMu.Lock()
	f.attr.SetOwner(ctx, owner)
	f.attrMu.Unlock()
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
func (f *fileInodeOperations) Truncate(ctx context.Context, _ *fs.Inode, size int64) error {
	f.attrMu.Lock()
	defer f.attrMu.Unlock()

	f.dataMu.Lock()
	oldSize := f.attr.Size

	// Check if current seals allow truncation.
	switch {
	case size > oldSize && f.seals&linux.F_SEAL_GROW != 0: // Grow sealed
		fallthrough
	case oldSize > size && f.seals&linux.F_SEAL_SHRINK != 0: // Shrink sealed
		f.dataMu.Unlock()
		return linuxerr.EPERM
	}

	if oldSize != size {
		f.attr.Size = size
		// Update mtime and ctime.
		now := ktime.NowFromContext(ctx)
		f.attr.ModificationTime = now
		f.attr.StatusChangeTime = now

		// Truncating clears privilege bits.
		f.attr.Perms.SetUID = false
		if f.attr.Perms.Group.Execute {
			f.attr.Perms.SetGID = false
		}
	}
	f.dataMu.Unlock()

	// Nothing left to do unless shrinking the file.
	if oldSize <= size {
		return nil
	}

	oldpgend := fs.OffsetPageEnd(oldSize)
	newpgend := fs.OffsetPageEnd(size)

	// Invalidate past translations of truncated pages.
	if newpgend != oldpgend {
		f.mapsMu.Lock()
		f.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/shmem.c:shmem_setattr() =>
			// mm/memory.c:unmap_mapping_range(even_cows=1).
			InvalidatePrivate: true,
		})
		f.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them.
	f.dataMu.Lock()
	defer f.dataMu.Unlock()
	f.data.Truncate(uint64(size), f.kernel.MemoryFile())

	return nil
}

// Allocate implements fs.InodeOperations.Allocate.
func (f *fileInodeOperations) Allocate(ctx context.Context, _ *fs.Inode, offset, length int64) error {
	newSize := offset + length

	f.attrMu.Lock()
	defer f.attrMu.Unlock()
	f.dataMu.Lock()
	defer f.dataMu.Unlock()

	if newSize <= f.attr.Size {
		return nil
	}

	// Check if current seals allow growth.
	if f.seals&linux.F_SEAL_GROW != 0 {
		return linuxerr.EPERM
	}

	f.attr.Size = newSize

	now := ktime.NowFromContext(ctx)
	f.attr.ModificationTime = now
	f.attr.StatusChangeTime = now

	return nil
}

// AddLink implements fs.InodeOperations.AddLink.
func (f *fileInodeOperations) AddLink() {
	f.attrMu.Lock()
	f.attr.Links++
	f.attrMu.Unlock()
}

// DropLink implements fs.InodeOperations.DropLink.
func (f *fileInodeOperations) DropLink() {
	f.attrMu.Lock()
	f.attr.Links--
	f.attrMu.Unlock()
}

// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange.
func (f *fileInodeOperations) NotifyStatusChange(ctx context.Context) {
	f.attrMu.Lock()
	f.attr.StatusChangeTime = ktime.NowFromContext(ctx)
	f.attrMu.Unlock()
}

// IsVirtual implements fs.InodeOperations.IsVirtual.
func (*fileInodeOperations) IsVirtual() bool {
	return true
}

// StatFS implements fs.InodeOperations.StatFS.
func (*fileInodeOperations) StatFS(context.Context) (fs.Info, error) {
	return fsInfo, nil
}

func (f *fileInodeOperations) read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	start := fsmetric.StartReadWait()
	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
	fsmetric.TmpfsReads.Increment()

	// Zero length reads for tmpfs are no-ops.
	if dst.NumBytes() == 0 {
		return 0, nil
	}

	// Have we reached EOF? We check for this again in
	// fileReadWriter.ReadToBlocks to avoid holding f.attrMu (which would
	// serialize reads) or f.dataMu (which would violate lock ordering), but
	// check here first (before calling into MM) since reading at EOF is
	// common: getting a return value of 0 from a read syscall is the only way
	// to detect EOF.
	//
	// TODO(jamieliu): Separate out f.attr.Size and use atomics instead of
	// f.dataMu.
	f.dataMu.RLock()
	size := f.attr.Size
	f.dataMu.RUnlock()
	if offset >= size {
		return 0, io.EOF
	}

	n, err := dst.CopyOutFrom(ctx, &fileReadWriter{f, offset})
	if !file.Dirent.Inode.MountSource.Flags.NoAtime {
		// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
		f.attrMu.Lock()
		f.attr.AccessTime = ktime.NowFromContext(ctx)
		f.attrMu.Unlock()
	}
	return n, err
}

func (f *fileInodeOperations) write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	// Zero length writes for tmpfs are no-ops.
	if src.NumBytes() == 0 {
		return 0, nil
	}

	f.attrMu.Lock()
	defer f.attrMu.Unlock()
	// Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time().
	now := ktime.NowFromContext(ctx)
	f.attr.ModificationTime = now
	f.attr.StatusChangeTime = now
	nwritten, err := src.CopyInTo(ctx, &fileReadWriter{f, offset})

	// Writing clears privilege bits.
	if nwritten > 0 {
		f.attr.Perms.DropSetUIDAndMaybeGID()
	}

	return nwritten, err
}

type fileReadWriter struct {
	f      *fileInodeOperations
	offset int64
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *fileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	rw.f.dataMu.RLock()
	defer rw.f.dataMu.RUnlock()

	// Compute the range to read.
	if rw.offset >= rw.f.attr.Size {
		return 0, io.EOF
	}
	end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.f.attr.Size)
	if end == rw.offset { // dsts.NumBytes() == 0?
		return 0, nil
	}

	mf := rw.f.kernel.MemoryFile()
	var done uint64
	seg, gap := rw.f.data.Find(uint64(rw.offset))
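	// Iterate over segments (ranges with backing pages) and gaps (holes) in
	// offset order; each iteration advances rw.offset, so the loop terminates
	// once the requested range is exhausted.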
	for rw.offset < end {
		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.offset += int64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Tmpfs holes are zero-filled.
			gapmr := gap.Range().Intersect(mr)
			dst := dsts.TakeFirst64(gapmr.Length())
			n, err := safemem.ZeroSeq(dst)
			done += n
			rw.offset += int64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
	return done, nil
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
func (rw *fileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	rw.f.dataMu.Lock()
	defer rw.f.dataMu.Unlock()

	// Compute the range to write.
	if srcs.NumBytes() == 0 {
		// Nothing to do.
		return 0, nil
	}
	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
	if end == math.MaxInt64 {
		// Overflow.
		return 0, linuxerr.EINVAL
	}

	// Check if seals prevent either file growth or all writes.
	switch {
	case rw.f.seals&linux.F_SEAL_WRITE != 0: // Write sealed
		return 0, linuxerr.EPERM
	case end > rw.f.attr.Size && rw.f.seals&linux.F_SEAL_GROW != 0: // Grow sealed
		// When growth is sealed, Linux effectively allows writes which would
		// normally grow the file to partially succeed up to the current EOF,
		// rounded down to the page boundary before the EOF.
		//
		// This happens because writes (and thus the growth check) for tmpfs
		// files proceed page-by-page on Linux, and the final write to the page
		// containing EOF fails, resulting in a partial write up to the start of
		// that page.
		//
		// To emulate this behavior, artificially truncate the write to the
		// start of the page containing the current EOF.
		//
		// See Linux, mm/filemap.c:generic_perform_write() and
		// mm/shmem.c:shmem_write_begin().
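		//
		// For example, with 4 KiB pages and attr.Size = 6144 (EOF in the
		// middle of the second page), a grow-sealed write at offset 3000 of
		// length 2000 is truncated to end at 4096, so only 1096 bytes are
		// written.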
		if pgstart := int64(hostarch.Addr(rw.f.attr.Size).RoundDown()); end > pgstart {
			end = pgstart
		}
		if end <= rw.offset {
			// Truncation would result in no data being written.
			return 0, linuxerr.EPERM
		}
	}

	defer func() {
		// If the write ends beyond the file's previous size, it causes the
		// file to grow.
		if rw.offset > rw.f.attr.Size {
			rw.f.attr.Size = rw.offset
		}
	}()

	mf := rw.f.kernel.MemoryFile()
	// Page-aligned mr for when we need to allocate memory. RoundUp can't
	// overflow since end is an int64.
	pgstartaddr := hostarch.Addr(rw.offset).RoundDown()
	pgendaddr, _ := hostarch.Addr(end).RoundUp()
	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}

	var done uint64
	seg, gap := rw.f.data.Find(uint64(rw.offset))
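	// As in ReadToBlocks, alternate between segments and gaps. A gap is
	// filled by allocating backing pages and inserting the resulting segment;
	// the new segment is then written on the next iteration.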
	for rw.offset < end {
		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Write)
			if err != nil {
				return done, err
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.offset += int64(n)
			srcs = srcs.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Allocate memory for the write.
			gapMR := gap.Range().Intersect(pgMR)
			fr, err := mf.Allocate(gapMR.Length(), rw.f.memUsage)
			if err != nil {
				return done, err
			}

			// Write to that memory as usual.
			seg, gap = rw.f.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
		}
	}
	return done, nil
}

// AddMapping implements memmap.Mappable.AddMapping.
func (f *fileInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	f.mapsMu.Lock()
	defer f.mapsMu.Unlock()

	f.dataMu.RLock()
	defer f.dataMu.RUnlock()

	// Reject writable mapping if F_SEAL_WRITE is set.
	if f.seals&linux.F_SEAL_WRITE != 0 && writable {
		return linuxerr.EPERM
	}

	f.mappings.AddMapping(ms, ar, offset, writable)
	if writable {
		pagesBefore := f.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		f.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)

		if f.writableMappingPages < pagesBefore {
			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages))
		}
	}

	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (f *fileInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	f.mapsMu.Lock()
	defer f.mapsMu.Unlock()

	f.mappings.RemoveMapping(ms, ar, offset, writable)

	if writable {
		pagesBefore := f.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		f.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)

		if f.writableMappingPages > pagesBefore {
			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, f.writableMappingPages))
		}
	}
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (f *fileInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return f.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (f *fileInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	f.dataMu.Lock()
	defer f.dataMu.Unlock()

	// Constrain translations to f.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(f.attr.Size)
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mf := f.kernel.MemoryFile()
	cerr := f.data.Fill(ctx, required, optional, uint64(f.attr.Size), mf, f.memUsage, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
		// Newly-allocated pages are zeroed, so we don't need to do anything.
		return dsts.NumBytes(), nil
	})

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := f.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  hostarch.AnyAccess,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by f.data.Fill if it occurred outside of
	// required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (f *fileInodeOperations) InvalidateUnsavable(ctx context.Context) error {
	return nil
}

// GetSeals returns the current set of seals on a memfd inode.
func GetSeals(inode *fs.Inode) (uint32, error) {
	if f, ok := inode.InodeOperations.(*fileInodeOperations); ok {
		f.dataMu.RLock()
		defer f.dataMu.RUnlock()
		return f.seals, nil
	}
	// Not a memfd inode.
	return 0, linuxerr.EINVAL
}

// AddSeals adds new file seals to a memfd inode.
func AddSeals(inode *fs.Inode, val uint32) error {
	if f, ok := inode.InodeOperations.(*fileInodeOperations); ok {
		f.mapsMu.Lock()
		defer f.mapsMu.Unlock()
		f.dataMu.Lock()
		defer f.dataMu.Unlock()

		if f.seals&linux.F_SEAL_SEAL != 0 {
			// F_SEAL_SEAL is set, which prevents the addition of any new
			// seals.
			return linuxerr.EPERM
		}

		// F_SEAL_WRITE can only be added if there are no active writable maps.
		if f.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
			if f.writableMappingPages > 0 {
				return linuxerr.EBUSY
			}
		}

		// Seals can only be added, never removed.
		f.seals |= val
		return nil
	}
	// Not a memfd inode.
	return linuxerr.EINVAL
}