github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/copy_up.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package fs
    16  
    17  import (
    18  	"fmt"
    19  	"io"
    20  
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/hostarch"
    25  	"github.com/SagerNet/gvisor/pkg/log"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    27  	"github.com/SagerNet/gvisor/pkg/sync"
    28  	"github.com/SagerNet/gvisor/pkg/syserror"
    29  	"github.com/SagerNet/gvisor/pkg/usermem"
    30  )
    31  
// copyUp copies a file in an overlay from a lower filesystem to an
// upper filesystem so that the file can be modified in the upper
// filesystem. Copying a file involves several steps:
//
// - All parent directories of the file are created in the upper
//   filesystem if they don't exist there. For instance:
//
//     upper /dir0
//     lower /dir0/dir1/file
//
//   copyUp of /dir0/dir1/file creates /dir0/dir1 in order to create
//   /dir0/dir1/file.
//
// - The file content is copied from the lower file to the upper
//   file. For symlinks this is the symlink target. For directories,
//   upper directory entries are merged with lower directory entries
//   so there is no need to copy any entries.
//
// - A subset of file attributes of the lower file are set on the
//   upper file. These are the file owner, the file timestamps,
//   and all non-overlay extended attributes. copyUp will fail if
//   the upper filesystem does not support the setting of these
//   attributes.
//
//   The file's permissions are set when the file is created and its
//   size will be brought up to date when its contents are copied.
//   Notably no attempt is made to bring link count up to date because
//   hard links are currently not preserved across overlay filesystems.
//
// - Memory mappings of the lower file are invalidated and memory
//   references are transferred to the upper file. From this point on,
//   memory mappings of the file will be backed by content in the upper
//   filesystem.
//
// Synchronization:
//
// copyUp synchronizes with rename(2) using renameMu to ensure that
// parentage does not change while a file is being copied. In the context
// of rename(2), copyUpLockedForRename should be used to avoid deadlock on
// renameMu.
//
// The following operations synchronize with copyUp using copyMu:
//
// - InodeOperations, i.e. to ensure that looking up a directory takes
//   into account new upper filesystem directories created by copy up,
//   which subsequently can be modified.
//
// - FileOperations, i.e. to ensure that reading from a file does not
//   continue using a stale, lower filesystem handle when the file is
//   written to.
//
// Lock ordering: Dirent.mu -> Inode.overlay.copyMu -> Inode.mu.
//
// Caveats:
//
// If any step in copying up a file fails, copyUp cleans the upper
// filesystem of any partially up-to-date file. If this cleanup fails,
// the overlay may be in an unacceptable, inconsistent state, so copyUp
// panics. If copyUp fails because any step (above) fails, a generic
// error is returned.
//
// copyUp currently makes no attempt to optimize copying up file content.
// For large files, this means that copyUp blocks until the entire file
// is copied synchronously.
//
// Preconditions: d.Inode.overlay is non-nil (it is dereferenced
// unconditionally by copyUpLockedForRename).
func copyUp(ctx context.Context, d *Dirent) error {
	// Hold renameMu for reading for the whole copy so that a concurrent
	// rename(2) cannot change d's parentage underneath us. The unlock is
	// deferred so the lock is also released if the copy panics.
	renameMu.RLock()
	defer renameMu.RUnlock()
	return copyUpLockedForRename(ctx, d)
}
   101  
   102  // copyUpLockedForRename is the same as copyUp except that it does not lock
   103  // renameMu.
   104  //
   105  // It copies each component of d that does not yet exist in the upper
   106  // filesystem. If d already exists in the upper filesystem, it is a no-op.
   107  //
   108  // Any error returned indicates a failure to copy all of d. This may
   109  // leave the upper filesystem filled with any number of parent directories
   110  // but the upper filesystem will never be in an inconsistent state.
   111  //
   112  // Preconditions: d.Inode.overlay is non-nil.
   113  func copyUpLockedForRename(ctx context.Context, d *Dirent) error {
   114  	for {
   115  		// Did we race with another copy up or does there
   116  		// already exist something in the upper filesystem
   117  		// for d?
   118  		d.Inode.overlay.copyMu.RLock()
   119  		if d.Inode.overlay.upper != nil {
   120  			d.Inode.overlay.copyMu.RUnlock()
   121  			// Done, d is in the upper filesystem.
   122  			return nil
   123  		}
   124  		d.Inode.overlay.copyMu.RUnlock()
   125  
   126  		// Find the next component to copy up. We will work our way
   127  		// down to the last component of d and finally copy it.
   128  		next := findNextCopyUp(ctx, d)
   129  
   130  		// Attempt to copy.
   131  		if err := doCopyUp(ctx, next); err != nil {
   132  			return err
   133  		}
   134  	}
   135  }
   136  
   137  // findNextCopyUp finds the next component of d from root that does not
   138  // yet exist in the upper filesystem. The parent of this component is
   139  // also returned, which is the root of the overlay in the worst case.
   140  func findNextCopyUp(ctx context.Context, d *Dirent) *Dirent {
   141  	next := d
   142  	for parent := next.parent; ; /* checked in-loop */ /* updated in-loop */ {
   143  		// Does this parent have a non-nil upper Inode?
   144  		parent.Inode.overlay.copyMu.RLock()
   145  		if parent.Inode.overlay.upper != nil {
   146  			parent.Inode.overlay.copyMu.RUnlock()
   147  			// Note that since we found an upper, it is stable.
   148  			return next
   149  		}
   150  		parent.Inode.overlay.copyMu.RUnlock()
   151  
   152  		// Continue searching for a parent with a non-nil
   153  		// upper Inode.
   154  		next = parent
   155  		parent = next.parent
   156  	}
   157  }
   158  
   159  func doCopyUp(ctx context.Context, d *Dirent) error {
   160  	// Fail fast on Inode types we won't be able to copy up anyways. These
   161  	// Inodes may block in GetFile while holding copyMu for reading. If we
   162  	// then try to take copyMu for writing here, we'd deadlock.
   163  	t := d.Inode.overlay.lower.StableAttr.Type
   164  	if t != RegularFile && t != Directory && t != Symlink {
   165  		return linuxerr.EINVAL
   166  	}
   167  
   168  	// Wait to get exclusive access to the upper Inode.
   169  	d.Inode.overlay.copyMu.Lock()
   170  	defer d.Inode.overlay.copyMu.Unlock()
   171  	if d.Inode.overlay.upper != nil {
   172  		// We raced with another doCopyUp, no problem.
   173  		return nil
   174  	}
   175  
   176  	// Perform the copy.
   177  	return copyUpLocked(ctx, d.parent, d)
   178  }
   179  
   180  // copyUpLocked creates a copy of next in the upper filesystem of parent.
   181  //
   182  // copyUpLocked must be called with d.Inode.overlay.copyMu locked.
   183  //
   184  // Returns a generic error on failure.
   185  //
   186  // Preconditions:
   187  // * parent.Inode.overlay.upper must be non-nil.
   188  // * next.Inode.overlay.copyMu must be locked writable.
   189  // * next.Inode.overlay.lower must be non-nil.
   190  // * next.Inode.overlay.lower.StableAttr.Type must be RegularFile, Directory,
   191  //   or Symlink.
   192  // * upper filesystem must support setting file ownership and timestamps.
   193  func copyUpLocked(ctx context.Context, parent *Dirent, next *Dirent) error {
   194  	// Extract the attributes of the file we wish to copy.
   195  	attrs, err := next.Inode.overlay.lower.UnstableAttr(ctx)
   196  	if err != nil {
   197  		log.Warningf("copy up failed to get lower attributes: %v", err)
   198  		return syserror.EIO
   199  	}
   200  
   201  	var childUpperInode *Inode
   202  	parentUpper := parent.Inode.overlay.upper
   203  	root := RootFromContext(ctx)
   204  	if root != nil {
   205  		defer root.DecRef(ctx)
   206  	}
   207  
   208  	// Create the file in the upper filesystem and get an Inode for it.
   209  	switch next.Inode.StableAttr.Type {
   210  	case RegularFile:
   211  		childFile, err := parentUpper.Create(ctx, root, next.name, FileFlags{Read: true, Write: true}, attrs.Perms)
   212  		if err != nil {
   213  			log.Warningf("copy up failed to create file: %v", err)
   214  			return syserror.EIO
   215  		}
   216  		defer childFile.DecRef(ctx)
   217  		childUpperInode = childFile.Dirent.Inode
   218  
   219  	case Directory:
   220  		if err := parentUpper.CreateDirectory(ctx, root, next.name, attrs.Perms); err != nil {
   221  			log.Warningf("copy up failed to create directory: %v", err)
   222  			return syserror.EIO
   223  		}
   224  		childUpper, err := parentUpper.Lookup(ctx, next.name)
   225  		if err != nil {
   226  			werr := fmt.Errorf("copy up failed to lookup directory: %v", err)
   227  			cleanupUpper(ctx, parentUpper, next.name, werr)
   228  			return syserror.EIO
   229  		}
   230  		defer childUpper.DecRef(ctx)
   231  		childUpperInode = childUpper.Inode
   232  
   233  	case Symlink:
   234  		childLower := next.Inode.overlay.lower
   235  		link, err := childLower.Readlink(ctx)
   236  		if err != nil {
   237  			log.Warningf("copy up failed to read symlink value: %v", err)
   238  			return syserror.EIO
   239  		}
   240  		if err := parentUpper.CreateLink(ctx, root, link, next.name); err != nil {
   241  			log.Warningf("copy up failed to create symlink: %v", err)
   242  			return syserror.EIO
   243  		}
   244  		childUpper, err := parentUpper.Lookup(ctx, next.name)
   245  		if err != nil {
   246  			werr := fmt.Errorf("copy up failed to lookup symlink: %v", err)
   247  			cleanupUpper(ctx, parentUpper, next.name, werr)
   248  			return syserror.EIO
   249  		}
   250  		defer childUpper.DecRef(ctx)
   251  		childUpperInode = childUpper.Inode
   252  
   253  	default:
   254  		panic(fmt.Sprintf("copy up of invalid type %v on %+v", next.Inode.StableAttr.Type, next))
   255  	}
   256  
   257  	// Bring file attributes up to date. This does not include size, which will be
   258  	// brought up to date with copyContentsLocked.
   259  	if err := copyAttributesLocked(ctx, childUpperInode, next.Inode.overlay.lower); err != nil {
   260  		werr := fmt.Errorf("copy up failed to copy up attributes: %v", err)
   261  		cleanupUpper(ctx, parentUpper, next.name, werr)
   262  		return syserror.EIO
   263  	}
   264  
   265  	// Copy the entire file.
   266  	if err := copyContentsLocked(ctx, childUpperInode, next.Inode.overlay.lower, attrs.Size); err != nil {
   267  		werr := fmt.Errorf("copy up failed to copy up contents: %v", err)
   268  		cleanupUpper(ctx, parentUpper, next.name, werr)
   269  		return syserror.EIO
   270  	}
   271  
   272  	lowerMappable := next.Inode.overlay.lower.Mappable()
   273  	upperMappable := childUpperInode.Mappable()
   274  	if lowerMappable != nil && upperMappable == nil {
   275  		werr := fmt.Errorf("copy up failed: cannot ensure memory mapping coherence")
   276  		cleanupUpper(ctx, parentUpper, next.name, werr)
   277  		return syserror.EIO
   278  	}
   279  
   280  	// Propagate memory mappings to the upper Inode.
   281  	next.Inode.overlay.mapsMu.Lock()
   282  	defer next.Inode.overlay.mapsMu.Unlock()
   283  	if upperMappable != nil {
   284  		// Remember which mappings we added so we can remove them on failure.
   285  		allAdded := make(map[memmap.MappableRange]memmap.MappingsOfRange)
   286  		for seg := next.Inode.overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
   287  			added := make(memmap.MappingsOfRange)
   288  			for m := range seg.Value() {
   289  				if err := upperMappable.AddMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable); err != nil {
   290  					for m := range added {
   291  						upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
   292  					}
   293  					for mr, mappings := range allAdded {
   294  						for m := range mappings {
   295  							upperMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, mr.Start, m.Writable)
   296  						}
   297  					}
   298  					return err
   299  				}
   300  				added[m] = struct{}{}
   301  			}
   302  			allAdded[seg.Range()] = added
   303  		}
   304  	}
   305  
   306  	// Take a reference on the upper Inode (transferred to
   307  	// next.Inode.overlay.upper) and make new translations use it.
   308  	overlay := next.Inode.overlay
   309  	overlay.dataMu.Lock()
   310  	childUpperInode.IncRef()
   311  	overlay.upper = childUpperInode
   312  	overlay.dataMu.Unlock()
   313  
   314  	// Invalidate existing translations through the lower Inode.
   315  	overlay.mappings.InvalidateAll(memmap.InvalidateOpts{})
   316  
   317  	// Remove existing memory mappings from the lower Inode.
   318  	if lowerMappable != nil {
   319  		for seg := overlay.mappings.FirstSegment(); seg.Ok(); seg = seg.NextSegment() {
   320  			for m := range seg.Value() {
   321  				lowerMappable.RemoveMapping(ctx, m.MappingSpace, m.AddrRange, seg.Start(), m.Writable)
   322  			}
   323  		}
   324  	}
   325  
   326  	return nil
   327  }
   328  
   329  // cleanupUpper is called when copy-up fails. It logs the copy-up error and
   330  // attempts to remove name from parent. If that fails, then it panics.
   331  func cleanupUpper(ctx context.Context, parent *Inode, name string, copyUpErr error) {
   332  	log.Warningf(copyUpErr.Error())
   333  	if err := parent.InodeOperations.Remove(ctx, parent, name); err != nil {
   334  		// Unfortunately we don't have much choice. We shouldn't
   335  		// willingly give the caller access to a nonsense filesystem.
   336  		panic(fmt.Sprintf("overlay filesystem is in an inconsistent state: copyUp got error: %v; then cleanup failed to remove %q from upper filesystem: %v.", copyUpErr, name, err))
   337  	}
   338  }
   339  
// copyUpBuffers is a buffer pool for copying file content. Each buffer is
// 8*hostarch.PageSize bytes long, which is intended to be the same size
// used by io.Copy (32 KiB, assuming 4 KiB pages -- TODO confirm against
// hostarch.PageSize).
//
// The pool hands out *[]byte rather than []byte, so callers must
// dereference the result of Get. Pooling a pointer avoids allocating a new
// slice header every time a buffer is converted to interface{}.
var copyUpBuffers = sync.Pool{
	New: func() interface{} {
		b := make([]byte, 8*hostarch.PageSize)
		return &b
	},
}
   348  
   349  // copyContentsLocked copies the contents of lower to upper. It panics if
   350  // less than size bytes can be copied.
   351  func copyContentsLocked(ctx context.Context, upper *Inode, lower *Inode, size int64) error {
   352  	// We don't support copying up for anything other than regular files.
   353  	if lower.StableAttr.Type != RegularFile {
   354  		return nil
   355  	}
   356  
   357  	// Get a handle to the upper filesystem, which we will write to.
   358  	upperFile, err := overlayFile(ctx, upper, FileFlags{Write: true})
   359  	if err != nil {
   360  		return err
   361  	}
   362  	defer upperFile.DecRef(ctx)
   363  
   364  	// Get a handle to the lower filesystem, which we will read from.
   365  	lowerFile, err := overlayFile(ctx, lower, FileFlags{Read: true})
   366  	if err != nil {
   367  		return err
   368  	}
   369  	defer lowerFile.DecRef(ctx)
   370  
   371  	// Use a buffer pool to minimize allocations.
   372  	buf := copyUpBuffers.Get().(*[]byte)
   373  	defer copyUpBuffers.Put(buf)
   374  
   375  	// Transfer the contents.
   376  	//
   377  	// One might be able to optimize this by doing parallel reads, parallel writes and reads, larger
   378  	// buffers, etc. But we really don't know anything about the underlying implementation, so these
   379  	// optimizations could be self-defeating. So we leave this as simple as possible.
   380  	var offset int64
   381  	for {
   382  		nr, err := lowerFile.FileOperations.Read(ctx, lowerFile, usermem.BytesIOSequence(*buf), offset)
   383  		if err != nil && err != io.EOF {
   384  			return err
   385  		}
   386  		if nr == 0 {
   387  			if offset != size {
   388  				// Same as in cleanupUpper, we cannot live
   389  				// with ourselves if we do anything less.
   390  				panic(fmt.Sprintf("filesystem is in an inconsistent state: wrote only %d bytes of %d sized file", offset, size))
   391  			}
   392  			return nil
   393  		}
   394  		nw, err := upperFile.FileOperations.Write(ctx, upperFile, usermem.BytesIOSequence((*buf)[:nr]), offset)
   395  		if err != nil {
   396  			return err
   397  		}
   398  		offset += nw
   399  	}
   400  }
   401  
   402  // copyAttributesLocked copies a subset of lower's attributes to upper,
   403  // specifically owner, timestamps (except of status change time), and
   404  // extended attributes. Notably no attempt is made to copy link count.
   405  // Size and permissions are set on upper when the file content is copied
   406  // and when the file is created respectively.
   407  func copyAttributesLocked(ctx context.Context, upper *Inode, lower *Inode) error {
   408  	// Extract attributes from the lower filesystem.
   409  	lowerAttr, err := lower.UnstableAttr(ctx)
   410  	if err != nil {
   411  		return err
   412  	}
   413  	lowerXattr, err := lower.ListXattr(ctx, linux.XATTR_SIZE_MAX)
   414  	if err != nil && !linuxerr.Equals(linuxerr.EOPNOTSUPP, err) {
   415  		return err
   416  	}
   417  
   418  	// Set the attributes on the upper filesystem.
   419  	if err := upper.InodeOperations.SetOwner(ctx, upper, lowerAttr.Owner); err != nil {
   420  		return err
   421  	}
   422  	if err := upper.InodeOperations.SetTimestamps(ctx, upper, TimeSpec{
   423  		ATime: lowerAttr.AccessTime,
   424  		MTime: lowerAttr.ModificationTime,
   425  	}); err != nil {
   426  		return err
   427  	}
   428  	for name := range lowerXattr {
   429  		// Don't copy-up attributes that configure an overlay in the
   430  		// lower.
   431  		if isXattrOverlay(name) {
   432  			continue
   433  		}
   434  		value, err := lower.GetXattr(ctx, name, linux.XATTR_SIZE_MAX)
   435  		if err != nil {
   436  			return err
   437  		}
   438  		if err := upper.InodeOperations.SetXattr(ctx, upper, name, value, 0 /* flags */); err != nil {
   439  			return err
   440  		}
   441  	}
   442  	return nil
   443  }