github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/verity/verity.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package verity provides a filesystem implementation that is a wrapper of
    16  // another file system.
    17  // The verity file system provides integrity check for the underlying file
    18  // system by providing verification for path traversals and each read.
    19  // The verity file system is read-only, except for one case: when
    20  // allowRuntimeEnable is true, additional Merkle files can be generated using
    21  // the FS_IOC_ENABLE_VERITY ioctl.
    22  //
    23  // Lock order:
    24  //
    25  // filesystem.renameMu
    26  //   dentry.dirMu
    27  //     fileDescription.mu
    28  //       filesystem.verityMu
    29  //         dentry.hashMu
    30  //
    31  // Locking dentry.dirMu in multiple dentries requires that parent dentries are
    32  // locked before child dentries, and that filesystem.renameMu is locked to
    33  // stabilize this relationship.
    34  package verity
    35  
    36  import (
    37  	"bytes"
    38  	"encoding/hex"
    39  	"encoding/json"
    40  	"fmt"
    41  	"math"
    42  	"sort"
    43  	"strconv"
    44  	"strings"
    45  	"sync/atomic"
    46  
    47  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    48  	"github.com/SagerNet/gvisor/pkg/context"
    49  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    50  	"github.com/SagerNet/gvisor/pkg/fspath"
    51  	"github.com/SagerNet/gvisor/pkg/hostarch"
    52  	"github.com/SagerNet/gvisor/pkg/marshal/primitive"
    53  	"github.com/SagerNet/gvisor/pkg/merkletree"
    54  	"github.com/SagerNet/gvisor/pkg/refsvfs2"
    55  	"github.com/SagerNet/gvisor/pkg/safemem"
    56  	"github.com/SagerNet/gvisor/pkg/sentry/arch"
    57  	fslock "github.com/SagerNet/gvisor/pkg/sentry/fs/lock"
    58  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    59  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    60  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    61  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    62  	"github.com/SagerNet/gvisor/pkg/sync"
    63  	"github.com/SagerNet/gvisor/pkg/syserror"
    64  	"github.com/SagerNet/gvisor/pkg/usermem"
    65  )
    66  
const (
	// Name is the default filesystem name.
	Name = "verity"

	// merklePrefix is the prefix of the Merkle tree files. For example, the
	// Merkle tree file for "/foo" is "/.merkle.verity.foo".
	merklePrefix = ".merkle.verity."

	// merkleRootPrefix is the prefix of the Merkle tree root file. This
	// needs to be different from merklePrefix to avoid name collision.
	merkleRootPrefix = ".merkleroot.verity."

	// merkleOffsetInParentXattr is the extended attribute name specifying the
	// offset of the child hash in its parent's Merkle tree.
	merkleOffsetInParentXattr = "user.merkle.offset"

	// merkleSizeXattr is the extended attribute name specifying the size of data
	// hashed by the corresponding Merkle tree. For a regular file, this is the
	// file size. For a directory, this is the size of all its children's hashes.
	merkleSizeXattr = "user.merkle.size"

	// childrenOffsetXattr is the extended attribute name specifying the
	// offset of the serialized children names in the Merkle tree file.
	childrenOffsetXattr = "user.merkle.childrenOffset"

	// childrenSizeXattr is the extended attribute name specifying the size
	// of the serialized children names.
	childrenSizeXattr = "user.merkle.childrenSize"

	// sizeOfStringInt32 is the size for a 32 bit integer stored as string in
	// extended attributes. The maximum value of a 32 bit integer has 10 digits.
	sizeOfStringInt32 = 10
)
   101  
var (
	// verityMu synchronizes concurrent operations that enable verity and perform
	// verification checks.
	//
	// NOTE(review): the filesystem struct in this file declares its own
	// verityMu field with a similar purpose; confirm which of the two is
	// actually used before relying on this package-level lock.
	verityMu sync.RWMutex
)
   107  
// Mount option names for verityfs. They are the keys GetFilesystem looks up
// in the parsed mount data; any other key is rejected with EINVAL.
const (
	// moptLowerPath names the absolute path of an existing mount to wrap.
	moptLowerPath = "lower_path"
	// moptRootHash names the hex-encoded root hash of the file system.
	moptRootHash = "root_hash"
	// moptRootName names the suffix appended to merkleRootPrefix to form the
	// root Merkle file name. GetFilesystem defaults it to "root".
	moptRootName = "root_name"
)
   114  
// HashAlgorithm is a type specifying the algorithm used to hash the file
// content. The supported values are SHA256 and SHA512.
type HashAlgorithm int
   118  
   119  // ViolationAction is a type specifying the action when an integrity violation
   120  // is detected.
   121  type ViolationAction int
   122  
   123  const (
   124  	// PanicOnViolation terminates the sentry on detected violation.
   125  	PanicOnViolation ViolationAction = 0
   126  	// ErrorOnViolation returns an error from the violating system call on
   127  	// detected violation.
   128  	ErrorOnViolation = 1
   129  )
   130  
// Currently supported hashing algorithms include SHA256 and SHA512.
const (
	// SHA256 is the zero value of HashAlgorithm.
	SHA256 HashAlgorithm = iota
	// SHA512 follows SHA256 in the iota sequence.
	SHA512
)
   136  
   137  func (alg HashAlgorithm) toLinuxHashAlg() int {
   138  	switch alg {
   139  	case SHA256:
   140  		return linux.FS_VERITY_HASH_ALG_SHA256
   141  	case SHA512:
   142  		return linux.FS_VERITY_HASH_ALG_SHA512
   143  	default:
   144  		return 0
   145  	}
   146  }
   147  
// FilesystemType implements vfs.FilesystemType. It carries no state; all
// configuration is supplied per mount through GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
   152  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem that GetFilesystem initializes with this
	// filesystem as its implementation.
	vfsfs vfs.Filesystem

	// creds is a copy of the filesystem's creator's credentials, which are
	// used for accesses to the underlying file system. creds is immutable.
	creds *auth.Credentials

	// allowRuntimeEnable is true if using ioctl with FS_IOC_ENABLE_VERITY
	// to build Merkle trees in the verity file system is allowed. If this
	// is false, no new Merkle trees can be built, and only the files that
	// had Merkle trees before startup (e.g. from a host filesystem mounted
	// with gofer fs) can be verified.
	allowRuntimeEnable bool

	// lowerMount is the underlying file system mount.
	lowerMount *vfs.Mount

	// rootDentry is the mount root Dentry for this file system, which
	// stores the root hash of the whole file system in bytes.
	rootDentry *dentry

	// alg is the algorithm used to hash the files in the verity file
	// system.
	alg HashAlgorithm

	// action specifies the action towards detected violation.
	action ViolationAction

	// opts is the string mount options passed to opts.Data.
	opts string

	// renameMu synchronizes renaming with non-renaming operations in order
	// to ensure consistent lock ordering between dentry.dirMu in different
	// dentries.
	renameMu sync.RWMutex `state:"nosave"`

	// verityMu synchronizes enabling verity files, protects files or
	// directories from being enabled by different threads simultaneously.
	// It also ensures that verity does not access files that are being
	// enabled.
	//
	// Also, the directory Merkle trees depend on the generated trees of
	// their children, so they shouldn't be enabled at the same time. This
	// lock is for the whole file system to ensure that no more than one
	// file is enabled at the same time.
	verityMu sync.RWMutex `state:"nosave"`
}
   203  
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
// GetFilesystem rejects it when the lower_path mount option is also given.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// LowerName is the name of the filesystem wrapped by verity fs.
	LowerName string

	// Alg is the algorithm used to hash the files in the verity file
	// system.
	Alg HashAlgorithm

	// AllowRuntimeEnable specifies whether the verity file system allows
	// enabling verification for files (i.e. building Merkle trees) during
	// runtime.
	AllowRuntimeEnable bool

	// LowerGetFSOptions is the file system option for the lower layer file
	// system wrapped by verity file system.
	LowerGetFSOptions vfs.GetFilesystemOptions

	// Action specifies the action on an integrity violation.
	Action ViolationAction
}
   228  
// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}
   233  
// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
   236  
   237  // alertIntegrityViolation alerts a violation of integrity, which usually means
   238  // unexpected modification to the file system is detected. In ErrorOnViolation
   239  // mode, it returns EIO, otherwise it panic.
   240  func (fs *filesystem) alertIntegrityViolation(msg string) error {
   241  	if fs.action == ErrorOnViolation {
   242  		return syserror.EIO
   243  	}
   244  	panic(msg)
   245  }
   246  
   247  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   248  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   249  	mopts := vfs.GenericParseMountOptions(opts.Data)
   250  	var rootHash []byte
   251  	if encodedRootHash, ok := mopts[moptRootHash]; ok {
   252  		delete(mopts, moptRootHash)
   253  		hash, err := hex.DecodeString(encodedRootHash)
   254  		if err != nil {
   255  			ctx.Warningf("verity.FilesystemType.GetFilesystem: Failed to decode root hash: %v", err)
   256  			return nil, nil, linuxerr.EINVAL
   257  		}
   258  		rootHash = hash
   259  	}
   260  	var lowerPathname string
   261  	if path, ok := mopts[moptLowerPath]; ok {
   262  		delete(mopts, moptLowerPath)
   263  		lowerPathname = path
   264  	}
   265  	rootName := "root"
   266  	if root, ok := mopts[moptRootName]; ok {
   267  		delete(mopts, moptRootName)
   268  		rootName = root
   269  	}
   270  
   271  	// Check for unparsed options.
   272  	if len(mopts) != 0 {
   273  		ctx.Warningf("verity.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   274  		return nil, nil, linuxerr.EINVAL
   275  	}
   276  
   277  	// Handle internal options.
   278  	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
   279  	if len(lowerPathname) == 0 && !ok {
   280  		ctx.Warningf("verity.FilesystemType.GetFilesystem: missing verity configs")
   281  		return nil, nil, linuxerr.EINVAL
   282  	}
   283  	if len(lowerPathname) != 0 {
   284  		if ok {
   285  			ctx.Warningf("verity.FilesystemType.GetFilesystem: unexpected verity configs with specified lower path")
   286  			return nil, nil, linuxerr.EINVAL
   287  		}
   288  		iopts = InternalFilesystemOptions{
   289  			AllowRuntimeEnable: len(rootHash) == 0,
   290  			Action:             ErrorOnViolation,
   291  		}
   292  	}
   293  
   294  	var lowerMount *vfs.Mount
   295  	var mountedLowerVD vfs.VirtualDentry
   296  	// Use an existing mount if lowerPath is provided.
   297  	if len(lowerPathname) != 0 {
   298  		vfsroot := vfs.RootFromContext(ctx)
   299  		if vfsroot.Ok() {
   300  			defer vfsroot.DecRef(ctx)
   301  		}
   302  		lowerPath := fspath.Parse(lowerPathname)
   303  		if !lowerPath.Absolute {
   304  			ctx.Infof("verity.FilesystemType.GetFilesystem: lower_path %q must be absolute", lowerPathname)
   305  			return nil, nil, linuxerr.EINVAL
   306  		}
   307  		var err error
   308  		mountedLowerVD, err = vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
   309  			Root:               vfsroot,
   310  			Start:              vfsroot,
   311  			Path:               lowerPath,
   312  			FollowFinalSymlink: true,
   313  		}, &vfs.GetDentryOptions{
   314  			CheckSearchable: true,
   315  		})
   316  		if err != nil {
   317  			ctx.Infof("verity.FilesystemType.GetFilesystem: failed to resolve lower_path %q: %v", lowerPathname, err)
   318  			return nil, nil, err
   319  		}
   320  		lowerMount = mountedLowerVD.Mount()
   321  		defer mountedLowerVD.DecRef(ctx)
   322  	} else {
   323  		// Mount the lower file system. The lower file system is wrapped inside
   324  		// verity, and should not be exposed or connected.
   325  		mountOpts := &vfs.MountOptions{
   326  			GetFilesystemOptions: iopts.LowerGetFSOptions,
   327  			InternalMount:        true,
   328  		}
   329  		mnt, err := vfsObj.MountDisconnected(ctx, creds, "", iopts.LowerName, mountOpts)
   330  		if err != nil {
   331  			return nil, nil, err
   332  		}
   333  		lowerMount = mnt
   334  	}
   335  
   336  	fs := &filesystem{
   337  		creds:              creds.Fork(),
   338  		alg:                iopts.Alg,
   339  		lowerMount:         lowerMount,
   340  		action:             iopts.Action,
   341  		opts:               opts.Data,
   342  		allowRuntimeEnable: iopts.AllowRuntimeEnable,
   343  	}
   344  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   345  
   346  	// Construct the root dentry.
   347  	d := fs.newDentry()
   348  	d.refs = 1
   349  	lowerVD := vfs.MakeVirtualDentry(lowerMount, lowerMount.Root())
   350  	lowerVD.IncRef()
   351  	d.lowerVD = lowerVD
   352  
   353  	rootMerkleName := merkleRootPrefix + rootName
   354  
   355  	lowerMerkleVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
   356  		Root:  lowerVD,
   357  		Start: lowerVD,
   358  		Path:  fspath.Parse(rootMerkleName),
   359  	}, &vfs.GetDentryOptions{})
   360  
   361  	// If runtime enable is allowed, the root merkle tree may be absent. We
   362  	// should create the tree file.
   363  	if linuxerr.Equals(linuxerr.ENOENT, err) && fs.allowRuntimeEnable {
   364  		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
   365  			Root:  lowerVD,
   366  			Start: lowerVD,
   367  			Path:  fspath.Parse(rootMerkleName),
   368  		}, &vfs.OpenOptions{
   369  			Flags: linux.O_RDWR | linux.O_CREAT,
   370  			Mode:  0644,
   371  		})
   372  		if err != nil {
   373  			fs.vfsfs.DecRef(ctx)
   374  			d.DecRef(ctx)
   375  			return nil, nil, err
   376  		}
   377  		lowerMerkleFD.DecRef(ctx)
   378  		lowerMerkleVD, err = vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
   379  			Root:  lowerVD,
   380  			Start: lowerVD,
   381  			Path:  fspath.Parse(rootMerkleName),
   382  		}, &vfs.GetDentryOptions{})
   383  		if err != nil {
   384  			fs.vfsfs.DecRef(ctx)
   385  			d.DecRef(ctx)
   386  			return nil, nil, err
   387  		}
   388  	} else if err != nil {
   389  		// Failed to get dentry for the root Merkle file. This
   390  		// indicates an unexpected modification that removed/renamed
   391  		// the root Merkle file, or it's never generated.
   392  		fs.vfsfs.DecRef(ctx)
   393  		d.DecRef(ctx)
   394  		return nil, nil, fs.alertIntegrityViolation("Failed to find root Merkle file")
   395  	}
   396  
   397  	// Clear the Merkle tree file if they are to be generated at runtime.
   398  	// TODO(b/182315468): Optimize the Merkle tree generate process to
   399  	// allow only updating certain files/directories.
   400  	if fs.allowRuntimeEnable {
   401  		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
   402  			Root:  lowerMerkleVD,
   403  			Start: lowerMerkleVD,
   404  		}, &vfs.OpenOptions{
   405  			Flags: linux.O_RDWR | linux.O_TRUNC,
   406  			Mode:  0644,
   407  		})
   408  		if err != nil {
   409  			return nil, nil, err
   410  		}
   411  		lowerMerkleFD.DecRef(ctx)
   412  	}
   413  
   414  	d.lowerMerkleVD = lowerMerkleVD
   415  
   416  	// Get metadata from the underlying file system.
   417  	const statMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID
   418  	stat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
   419  		Root:  lowerVD,
   420  		Start: lowerVD,
   421  	}, &vfs.StatOptions{
   422  		Mask: statMask,
   423  	})
   424  	if err != nil {
   425  		fs.vfsfs.DecRef(ctx)
   426  		d.DecRef(ctx)
   427  		return nil, nil, err
   428  	}
   429  
   430  	d.mode = uint32(stat.Mode)
   431  	d.uid = stat.UID
   432  	d.gid = stat.GID
   433  	d.childrenNames = make(map[string]struct{})
   434  
   435  	d.hashMu.Lock()
   436  	d.hash = make([]byte, len(rootHash))
   437  	copy(d.hash, rootHash)
   438  	d.hashMu.Unlock()
   439  
   440  	fs.rootDentry = d
   441  
   442  	if !d.isDir() {
   443  		ctx.Warningf("verity root must be a directory")
   444  		return nil, nil, linuxerr.EINVAL
   445  	}
   446  
   447  	if !fs.allowRuntimeEnable {
   448  		// Get children names from the underlying file system.
   449  		offString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{
   450  			Root:  lowerMerkleVD,
   451  			Start: lowerMerkleVD,
   452  		}, &vfs.GetXattrOptions{
   453  			Name: childrenOffsetXattr,
   454  			Size: sizeOfStringInt32,
   455  		})
   456  		if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENODATA, err) {
   457  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenOffsetXattr, err))
   458  		}
   459  		if err != nil {
   460  			return nil, nil, err
   461  		}
   462  
   463  		off, err := strconv.Atoi(offString)
   464  		if err != nil {
   465  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenOffsetXattr, err))
   466  		}
   467  
   468  		sizeString, err := vfsObj.GetXattrAt(ctx, creds, &vfs.PathOperation{
   469  			Root:  lowerMerkleVD,
   470  			Start: lowerMerkleVD,
   471  		}, &vfs.GetXattrOptions{
   472  			Name: childrenSizeXattr,
   473  			Size: sizeOfStringInt32,
   474  		})
   475  		if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENODATA, err) {
   476  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", childrenSizeXattr, err))
   477  		}
   478  		if err != nil {
   479  			return nil, nil, err
   480  		}
   481  		size, err := strconv.Atoi(sizeString)
   482  		if err != nil {
   483  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", childrenSizeXattr, err))
   484  		}
   485  
   486  		lowerMerkleFD, err := vfsObj.OpenAt(ctx, fs.creds, &vfs.PathOperation{
   487  			Root:  lowerMerkleVD,
   488  			Start: lowerMerkleVD,
   489  		}, &vfs.OpenOptions{
   490  			Flags: linux.O_RDONLY,
   491  		})
   492  		if linuxerr.Equals(linuxerr.ENOENT, err) {
   493  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to open root Merkle file: %v", err))
   494  		}
   495  		if err != nil {
   496  			return nil, nil, err
   497  		}
   498  
   499  		defer lowerMerkleFD.DecRef(ctx)
   500  
   501  		childrenNames := make([]byte, size)
   502  		if _, err := lowerMerkleFD.PRead(ctx, usermem.BytesIOSequence(childrenNames), int64(off), vfs.ReadOptions{}); err != nil {
   503  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to read root children map: %v", err))
   504  		}
   505  
   506  		if err := json.Unmarshal(childrenNames, &d.childrenNames); err != nil {
   507  			return nil, nil, fs.alertIntegrityViolation(fmt.Sprintf("Failed to deserialize childrenNames: %v", err))
   508  		}
   509  
   510  		if err := fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
   511  			return nil, nil, err
   512  		}
   513  		d.generateChildrenList()
   514  	}
   515  
   516  	d.vfsd.Init(d)
   517  
   518  	return &fs.vfsfs, &d.vfsd, nil
   519  }
   520  
// Release implements vfs.FilesystemImpl.Release. It drops the filesystem's
// reference on the lower mount.
func (fs *filesystem) Release(ctx context.Context) {
	fs.lowerMount.DecRef(ctx)
}
   525  
// MountOptions implements vfs.FilesystemImpl.MountOptions. It returns the
// option string that was passed to GetFilesystem as opts.Data.
func (fs *filesystem) MountOptions() string {
	return fs.opts
}
   530  
// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	// vfsd is the vfs.Dentry initialized with this dentry as its
	// implementation.
	vfsd vfs.Dentry

	// refs is the reference count, accessed with atomic operations. It is
	// set to -1 once the dentry has been destroyed.
	refs int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// mode, uid, gid and size are the file mode, owner, group, and size of
	// the file in the underlying file system. They are set when a dentry
	// is initialized, and never modified.
	mode uint32
	uid  uint32
	gid  uint32
	size uint32

	// parent is the dentry corresponding to this dentry's parent directory.
	// name is this dentry's name in parent. If this dentry is a filesystem
	// root, parent is nil and name is the empty string. parent and name are
	// protected by fs.renameMu.
	parent *dentry
	name   string

	// If this dentry represents a directory, children maps the names of
	// children for which dentries have been instantiated to those dentries,
	// and dirents (if not nil) is a cache of dirents as returned by
	// directoryFDs representing this directory. children is protected by
	// dirMu.
	dirMu    sync.Mutex `state:"nosave"`
	children map[string]*dentry

	// childrenNames stores the name of all children of the dentry. This is
	// used by verity to check whether a child is expected. This is only
	// populated by enableVerity. childrenNames is also protected by dirMu.
	childrenNames map[string]struct{}

	// childrenList is a complete sorted list of childrenNames. This list
	// is generated when verity is enabled, or the first time the file is
	// verified in non runtime enable mode.
	childrenList []string

	// lowerVD is the VirtualDentry in the underlying file system. It is
	// never modified after initialized.
	lowerVD vfs.VirtualDentry

	// lowerMerkleVD is the VirtualDentry of the corresponding Merkle tree
	// in the underlying file system. It is never modified after
	// initialized.
	lowerMerkleVD vfs.VirtualDentry

	// symlinkTarget is the target path of a symlink file in the underlying
	// filesystem.
	symlinkTarget string

	// hash is the calculated hash for the current file or directory. hash
	// is protected by hashMu.
	hashMu sync.RWMutex `state:"nosave"`
	hash   []byte
}
   592  
   593  // newDentry creates a new dentry representing the given verity file. The
   594  // dentry initially has no references; it is the caller's responsibility to set
   595  // the dentry's reference count and/or call dentry.destroy() as appropriate.
   596  // The dentry is initially invalid in that it contains no underlying dentry;
   597  // the caller is responsible for setting them.
   598  func (fs *filesystem) newDentry() *dentry {
   599  	d := &dentry{
   600  		fs: fs,
   601  	}
   602  	d.vfsd.Init(d)
   603  	refsvfs2.Register(d)
   604  	return d
   605  }
   606  
   607  // IncRef implements vfs.DentryImpl.IncRef.
   608  func (d *dentry) IncRef() {
   609  	r := atomic.AddInt64(&d.refs, 1)
   610  	if d.LogRefs() {
   611  		refsvfs2.LogIncRef(d, r)
   612  	}
   613  }
   614  
   615  // TryIncRef implements vfs.DentryImpl.TryIncRef.
   616  func (d *dentry) TryIncRef() bool {
   617  	for {
   618  		r := atomic.LoadInt64(&d.refs)
   619  		if r <= 0 {
   620  			return false
   621  		}
   622  		if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
   623  			if d.LogRefs() {
   624  				refsvfs2.LogTryIncRef(d, r+1)
   625  			}
   626  			return true
   627  		}
   628  	}
   629  }
   630  
   631  // DecRef implements vfs.DentryImpl.DecRef.
   632  func (d *dentry) DecRef(ctx context.Context) {
   633  	r := atomic.AddInt64(&d.refs, -1)
   634  	if d.LogRefs() {
   635  		refsvfs2.LogDecRef(d, r)
   636  	}
   637  	if r == 0 {
   638  		d.fs.renameMu.Lock()
   639  		d.checkDropLocked(ctx)
   640  		d.fs.renameMu.Unlock()
   641  	} else if r < 0 {
   642  		panic("verity.dentry.DecRef() called without holding a reference")
   643  	}
   644  }
   645  
   646  func (d *dentry) decRefLocked(ctx context.Context) {
   647  	r := atomic.AddInt64(&d.refs, -1)
   648  	if d.LogRefs() {
   649  		refsvfs2.LogDecRef(d, r)
   650  	}
   651  	if r == 0 {
   652  		d.checkDropLocked(ctx)
   653  	} else if r < 0 {
   654  		panic("verity.dentry.decRefLocked() called without holding a reference")
   655  	}
   656  }
   657  
   658  // checkDropLocked should be called after d's reference count becomes 0 or it
   659  // becomes deleted.
   660  func (d *dentry) checkDropLocked(ctx context.Context) {
   661  	// Dentries with a positive reference count must be retained. Dentries
   662  	// with a negative reference count have already been destroyed.
   663  	if atomic.LoadInt64(&d.refs) != 0 {
   664  		return
   665  	}
   666  	// Refs is still zero; destroy it.
   667  	d.destroyLocked(ctx)
   668  	return
   669  }
   670  
   671  // destroyLocked destroys the dentry.
   672  //
   673  // Preconditions:
   674  // * d.fs.renameMu must be locked for writing.
   675  // * d.refs == 0.
   676  func (d *dentry) destroyLocked(ctx context.Context) {
   677  	switch atomic.LoadInt64(&d.refs) {
   678  	case 0:
   679  		// Mark the dentry destroyed.
   680  		atomic.StoreInt64(&d.refs, -1)
   681  	case -1:
   682  		panic("verity.dentry.destroyLocked() called on already destroyed dentry")
   683  	default:
   684  		panic("verity.dentry.destroyLocked() called with references on the dentry")
   685  	}
   686  
   687  	if d.lowerVD.Ok() {
   688  		d.lowerVD.DecRef(ctx)
   689  	}
   690  	if d.lowerMerkleVD.Ok() {
   691  		d.lowerMerkleVD.DecRef(ctx)
   692  	}
   693  	if d.parent != nil {
   694  		d.parent.dirMu.Lock()
   695  		if !d.vfsd.IsDead() {
   696  			delete(d.parent.children, d.name)
   697  		}
   698  		d.parent.dirMu.Unlock()
   699  		d.parent.decRefLocked(ctx)
   700  	}
   701  	refsvfs2.Unregister(d)
   702  }
   703  
// RefType implements refsvfs2.CheckedObject.Type. It returns the fixed label
// "verity.dentry".
func (d *dentry) RefType() string {
	return "verity.dentry"
}
   708  
   709  // LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
   710  func (d *dentry) LeakMessage() string {
   711  	return fmt.Sprintf("[verity.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
   712  }
   713  
// LogRefs implements refsvfs2.CheckedObject.LogRefs. It currently always
// returns false, which disables reference logging for verity dentries.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
func (d *dentry) LogRefs() bool {
	return false
}
   721  
// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. It is
// currently a no-op.
func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
	// TODO(b/159261227): Implement InotifyWithParent.
}
   726  
// Watches implements vfs.DentryImpl.Watches. It currently always returns nil.
func (d *dentry) Watches() *vfs.Watches {
	// TODO(b/159261227): Implement Watches.
	return nil
}
   732  
// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. It is currently a
// no-op.
func (d *dentry) OnZeroWatches(context.Context) {
	// TODO(b/159261227): Implement OnZeroWatches.
}
   737  
   738  func (d *dentry) isSymlink() bool {
   739  	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFLNK
   740  }
   741  
   742  func (d *dentry) isDir() bool {
   743  	return atomic.LoadUint32(&d.mode)&linux.S_IFMT == linux.S_IFDIR
   744  }
   745  
   746  func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   747  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
   748  }
   749  
   750  // verityEnabled checks whether the file is enabled with verity features. It
   751  // should always be true if runtime enable is not allowed. In runtime enable
   752  // mode, it returns true if the target has been enabled with
   753  // ioctl(FS_IOC_ENABLE_VERITY).
   754  func (d *dentry) verityEnabled() bool {
   755  	d.hashMu.RLock()
   756  	defer d.hashMu.RUnlock()
   757  	return !d.fs.allowRuntimeEnable || len(d.hash) != 0
   758  }
   759  
   760  // generateChildrenList generates a sorted childrenList from childrenNames, and
   761  // cache it in d for hashing.
   762  func (d *dentry) generateChildrenList() {
   763  	if len(d.childrenList) == 0 && len(d.childrenNames) != 0 {
   764  		for child := range d.childrenNames {
   765  			d.childrenList = append(d.childrenList, child)
   766  		}
   767  		sort.Strings(d.childrenList)
   768  	}
   769  }
   770  
   771  // getLowerAt returns the dentry in the underlying file system, which is
   772  // represented by filename relative to d.
   773  func (d *dentry) getLowerAt(ctx context.Context, vfsObj *vfs.VirtualFilesystem, filename string) (vfs.VirtualDentry, error) {
   774  	return vfsObj.GetDentryAt(ctx, d.fs.creds, &vfs.PathOperation{
   775  		Root:  d.lowerVD,
   776  		Start: d.lowerVD,
   777  		Path:  fspath.Parse(filename),
   778  	}, &vfs.GetDentryOptions{})
   779  }
   780  
   781  func (d *dentry) readlink(ctx context.Context) (string, error) {
   782  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
   783  	if d.verityEnabled() {
   784  		stat, err := vfsObj.StatAt(ctx, d.fs.creds, &vfs.PathOperation{
   785  			Root:  d.lowerVD,
   786  			Start: d.lowerVD,
   787  		}, &vfs.StatOptions{})
   788  		if err != nil {
   789  			return "", err
   790  		}
   791  		d.dirMu.Lock()
   792  		defer d.dirMu.Unlock()
   793  		if err := d.fs.verifyStatAndChildrenLocked(ctx, d, stat); err != nil {
   794  			return "", err
   795  		}
   796  		return d.symlinkTarget, nil
   797  	}
   798  
   799  	return d.fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
   800  		Root:  d.lowerVD,
   801  		Start: d.lowerVD,
   802  	})
   803  }
   804  
// fileDescription implements vfs.FileDescriptionImpl for verity fds.
// fileDescription is a wrapper of the underlying lowerFD, with support to build
// Merkle trees through the Linux fs-verity API to verify contents read from
// lowerFD.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription associated with this implementation.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl

	// d is the corresponding dentry to the fileDescription.
	d *dentry

	// isDir specifies whether the fileDescription points to a directory.
	isDir bool

	// lowerFD is the FileDescription corresponding to the file in the
	// underlying file system.
	lowerFD *vfs.FileDescription

	// lowerMappable is the memmap.Mappable corresponding to this file in the
	// underlying file system.
	lowerMappable memmap.Mappable

	// merkleReader is the read-only FileDescription corresponding to the
	// Merkle tree file in the underlying file system.
	merkleReader *vfs.FileDescription

	// merkleWriter is the FileDescription corresponding to the Merkle tree
	// file in the underlying file system for writing. This should only be
	// used when allowRuntimeEnable is set to true.
	merkleWriter *vfs.FileDescription

	// parentMerkleWriter is the FileDescription of the Merkle tree for the
	// directory that contains the current file/directory. This is only used
	// if allowRuntimeEnable is set to true.
	parentMerkleWriter *vfs.FileDescription

	// off is the file offset. off is protected by mu.
	mu  sync.Mutex `state:"nosave"`
	off int64
}
   847  
   848  // Release implements vfs.FileDescriptionImpl.Release.
   849  func (fd *fileDescription) Release(ctx context.Context) {
   850  	fd.lowerFD.DecRef(ctx)
   851  	fd.merkleReader.DecRef(ctx)
   852  	if fd.merkleWriter != nil {
   853  		fd.merkleWriter.DecRef(ctx)
   854  	}
   855  	if fd.parentMerkleWriter != nil {
   856  		fd.parentMerkleWriter.DecRef(ctx)
   857  	}
   858  }
   859  
   860  // Stat implements vfs.FileDescriptionImpl.Stat.
   861  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   862  	stat, err := fd.lowerFD.Stat(ctx, opts)
   863  	if err != nil {
   864  		return linux.Statx{}, err
   865  	}
   866  	fd.d.dirMu.Lock()
   867  	if fd.d.verityEnabled() {
   868  		if err := fd.d.fs.verifyStatAndChildrenLocked(ctx, fd.d, stat); err != nil {
   869  			return linux.Statx{}, err
   870  		}
   871  	}
   872  	fd.d.dirMu.Unlock()
   873  	return stat, nil
   874  }
   875  
   876  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   877  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   878  	// Verity files are read-only.
   879  	return linuxerr.EPERM
   880  }
   881  
// IterDirents implements vfs.FileDescriptionImpl.IterDirents.
//
// It seeks the lower directory to fd.off, collects its entries while hiding
// Merkle tree files, and passes entries to cb. If verity is enabled for the
// directory, every entry other than "." and ".." must be present in
// fd.d.childrenNames and the number of collected entries must match the
// expected count; otherwise an integrity violation is reported. fd.mu is held
// for the duration of the call. Returns ENOTDIR if fd is not a directory.
func (fd *fileDescription) IterDirents(ctx context.Context, cb vfs.IterDirentsCallback) error {
	if !fd.d.isDir() {
		return syserror.ENOTDIR
	}
	fd.mu.Lock()
	defer fd.mu.Unlock()

	// Position the lower directory at this fd's current offset.
	if _, err := fd.lowerFD.Seek(ctx, fd.off, linux.SEEK_SET); err != nil {
		return err
	}

	// ds accumulates the entries that pass filtering and verification.
	var ds []vfs.Dirent
	err := fd.lowerFD.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
		// Do not include the Merkle tree files.
		if strings.Contains(dirent.Name, merklePrefix) || strings.Contains(dirent.Name, merkleRootPrefix) {
			return nil
		}
		if fd.d.verityEnabled() {
			// Verify that the child is expected.
			if dirent.Name != "." && dirent.Name != ".." {
				if _, ok := fd.d.childrenNames[dirent.Name]; !ok {
					return fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Unexpected children %s", dirent.Name))
				}
			}
		}
		ds = append(ds, dirent)
		return nil
	}))

	if err != nil {
		return err
	}

	// The result should be a part of all children plus "." and "..", counting from fd.off.
	if fd.d.verityEnabled() && len(ds) != len(fd.d.childrenNames)+2-int(fd.off) {
		return fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Unexpected children number %d", len(ds)))
	}

	// NOTE(review): ds was collected from the lower directory starting at
	// fd.off, but it is indexed with fd.off here. When fd.off > 0 this skips
	// the first fd.off collected entries (or emits nothing) -- confirm that
	// this is the intended behavior for repeated calls.
	for fd.off < int64(len(ds)) {
		if err := cb.Handle(ds[fd.off]); err != nil {
			return err
		}
		fd.off++
	}
	return nil
}
   929  
   930  // Seek implements vfs.FileDescriptionImpl.Seek.
   931  func (fd *fileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
   932  	fd.mu.Lock()
   933  	defer fd.mu.Unlock()
   934  	n := int64(0)
   935  	switch whence {
   936  	case linux.SEEK_SET:
   937  		// use offset as specified
   938  	case linux.SEEK_CUR:
   939  		n = fd.off
   940  	case linux.SEEK_END:
   941  		n = int64(fd.d.size)
   942  	default:
   943  		return 0, linuxerr.EINVAL
   944  	}
   945  	if offset > math.MaxInt64-n {
   946  		return 0, linuxerr.EINVAL
   947  	}
   948  	offset += n
   949  	if offset < 0 {
   950  		return 0, linuxerr.EINVAL
   951  	}
   952  	fd.off = offset
   953  	return offset, nil
   954  }
   955  
   956  // generateMerkleLocked generates a Merkle tree file for fd. If fd points to a
   957  // file /foo/bar, a Merkle tree file /foo/.merkle.verity.bar is generated. The
   958  // hash of the generated Merkle tree and the data size is returned.  If fd
   959  // points to a regular file, the data is the content of the file. If fd points
   960  // to a directory, the data is all hashes of its children, written to the Merkle
   961  // tree file. If fd represents a symlink, the data is empty and nothing is written
   962  // to the Merkle tree file.
   963  //
   964  // Preconditions: fd.d.fs.verityMu must be locked.
   965  func (fd *fileDescription) generateMerkleLocked(ctx context.Context) ([]byte, uint64, error) {
   966  	fdReader := FileReadWriteSeeker{
   967  		FD:  fd.lowerFD,
   968  		Ctx: ctx,
   969  	}
   970  	merkleReader := FileReadWriteSeeker{
   971  		FD:  fd.merkleReader,
   972  		Ctx: ctx,
   973  	}
   974  	merkleWriter := FileReadWriteSeeker{
   975  		FD:  fd.merkleWriter,
   976  		Ctx: ctx,
   977  	}
   978  
   979  	stat, err := fd.lowerFD.Stat(ctx, vfs.StatOptions{})
   980  	if err != nil {
   981  		return nil, 0, err
   982  	}
   983  
   984  	fd.d.generateChildrenList()
   985  
   986  	params := &merkletree.GenerateParams{
   987  		TreeReader:     &merkleReader,
   988  		TreeWriter:     &merkleWriter,
   989  		Children:       fd.d.childrenList,
   990  		HashAlgorithms: fd.d.fs.alg.toLinuxHashAlg(),
   991  		Name:           fd.d.name,
   992  		Mode:           uint32(stat.Mode),
   993  		UID:            stat.UID,
   994  		GID:            stat.GID,
   995  	}
   996  
   997  	switch atomic.LoadUint32(&fd.d.mode) & linux.S_IFMT {
   998  	case linux.S_IFREG:
   999  		// For a regular file, generate a Merkle tree based on its
  1000  		// content.
  1001  		params.File = &fdReader
  1002  		params.Size = int64(stat.Size)
  1003  		params.DataAndTreeInSameFile = false
  1004  	case linux.S_IFDIR:
  1005  		// For a directory, generate a Merkle tree based on the hashes
  1006  		// of its children that has already been written to the Merkle
  1007  		// tree file.
  1008  		merkleStat, err := fd.merkleReader.Stat(ctx, vfs.StatOptions{})
  1009  		if err != nil {
  1010  			return nil, 0, err
  1011  		}
  1012  
  1013  		params.Size = int64(merkleStat.Size)
  1014  		params.File = &merkleReader
  1015  		params.DataAndTreeInSameFile = true
  1016  	case linux.S_IFLNK:
  1017  		// For a symlink, generate a Merkle tree file but do not write the root hash
  1018  		// of the target file content to it. Return a hash of a VerityDescriptor object
  1019  		// which includes the symlink target name.
  1020  		target, err := fd.d.readlink(ctx)
  1021  		if err != nil {
  1022  			return nil, 0, err
  1023  		}
  1024  
  1025  		params.Size = int64(stat.Size)
  1026  		params.DataAndTreeInSameFile = false
  1027  		params.SymlinkTarget = target
  1028  	default:
  1029  		// TODO(b/167728857): Investigate whether and how we should
  1030  		// enable other types of file.
  1031  		return nil, 0, linuxerr.EINVAL
  1032  	}
  1033  	hash, err := merkletree.Generate(params)
  1034  	return hash, uint64(params.Size), err
  1035  }
  1036  
  1037  // recordChildrenLocked writes the names of fd's children into the
  1038  // corresponding Merkle tree file, and saves the offset/size of the map into
  1039  // xattrs.
  1040  //
  1041  // Preconditions:
  1042  // * fd.d.fs.verityMu must be locked.
  1043  // * fd.d.isDir() == true.
  1044  func (fd *fileDescription) recordChildrenLocked(ctx context.Context) error {
  1045  	// Record the children names in the Merkle tree file.
  1046  	childrenNames, err := json.Marshal(fd.d.childrenNames)
  1047  	if err != nil {
  1048  		return err
  1049  	}
  1050  
  1051  	stat, err := fd.merkleWriter.Stat(ctx, vfs.StatOptions{})
  1052  	if err != nil {
  1053  		return err
  1054  	}
  1055  
  1056  	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
  1057  		Name:  childrenOffsetXattr,
  1058  		Value: strconv.Itoa(int(stat.Size)),
  1059  	}); err != nil {
  1060  		return err
  1061  	}
  1062  	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
  1063  		Name:  childrenSizeXattr,
  1064  		Value: strconv.Itoa(len(childrenNames)),
  1065  	}); err != nil {
  1066  		return err
  1067  	}
  1068  
  1069  	if _, err = fd.merkleWriter.Write(ctx, usermem.BytesIOSequence(childrenNames), vfs.WriteOptions{}); err != nil {
  1070  		return err
  1071  	}
  1072  
  1073  	return nil
  1074  }
  1075  
// enableVerity enables verity features on fd by generating a Merkle tree file
// and stores its hash in its parent directory's Merkle tree.
//
// It fails with EPERM unless the file system was configured with
// allowRuntimeEnable. The whole operation runs with fd.d.fs.verityMu held for
// writing. On success the generated hash is stored in fd.d.hash and (0, nil)
// is returned.
func (fd *fileDescription) enableVerity(ctx context.Context) (uintptr, error) {
	if !fd.d.fs.allowRuntimeEnable {
		return 0, linuxerr.EPERM
	}

	fd.d.fs.verityMu.Lock()
	defer fd.d.fs.verityMu.Unlock()

	// In allowRuntimeEnable mode, the underlying fd and read/write fd for
	// the Merkle tree file should have all been initialized. For any file
	// or directory other than the root, the parent Merkle tree file should
	// have also been initialized.
	if fd.lowerFD == nil || fd.merkleReader == nil || fd.merkleWriter == nil || (fd.parentMerkleWriter == nil && fd.d != fd.d.fs.rootDentry) {
		return 0, fd.d.fs.alertIntegrityViolation("Unexpected verity fd: missing expected underlying fds")
	}

	hash, dataSize, err := fd.generateMerkleLocked(ctx)
	if err != nil {
		return 0, err
	}

	if fd.parentMerkleWriter != nil {
		// The size of the parent's Merkle tree file before the write below
		// is the offset at which this file's hash will be stored.
		stat, err := fd.parentMerkleWriter.Stat(ctx, vfs.StatOptions{})
		if err != nil {
			return 0, err
		}

		// Write the hash of fd to the parent directory's Merkle tree
		// file, as it should be part of the parent Merkle tree data.
		// parentMerkleWriter is open with O_APPEND, so it should write
		// directly to the end of the file.
		if _, err = fd.parentMerkleWriter.Write(ctx, usermem.BytesIOSequence(hash), vfs.WriteOptions{}); err != nil {
			return 0, err
		}

		// Record the offset of the hash of fd in parent directory's
		// Merkle tree file.
		if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
			Name:  merkleOffsetInParentXattr,
			Value: strconv.Itoa(int(stat.Size)),
		}); err != nil {
			return 0, err
		}

		// Add the current child's name to parent's childrenNames.
		fd.d.parent.childrenNames[fd.d.name] = struct{}{}
	}

	// Record the size of the data being hashed for fd.
	if err := fd.merkleWriter.SetXattr(ctx, &vfs.SetXattrOptions{
		Name:  merkleSizeXattr,
		Value: strconv.Itoa(int(dataSize)),
	}); err != nil {
		return 0, err
	}

	// For a directory, additionally persist the names of its children in
	// its own Merkle tree file.
	if fd.d.isDir() {
		if err := fd.recordChildrenLocked(ctx); err != nil {
			return 0, err
		}
	}
	// Publish the hash only after all metadata has been written.
	fd.d.hashMu.Lock()
	fd.d.hash = hash
	fd.d.hashMu.Unlock()
	return 0, nil
}
  1144  
  1145  // measureVerity returns the hash of fd, saved in verityDigest.
  1146  func (fd *fileDescription) measureVerity(ctx context.Context, verityDigest hostarch.Addr) (uintptr, error) {
  1147  	t := kernel.TaskFromContext(ctx)
  1148  	if t == nil {
  1149  		return 0, linuxerr.EINVAL
  1150  	}
  1151  	var metadata linux.DigestMetadata
  1152  
  1153  	fd.d.hashMu.RLock()
  1154  	defer fd.d.hashMu.RUnlock()
  1155  
  1156  	// If allowRuntimeEnable is true, an empty fd.d.hash indicates that
  1157  	// verity is not enabled for the file. If allowRuntimeEnable is false,
  1158  	// this is an integrity violation because all files should have verity
  1159  	// enabled, in which case fd.d.hash should be set.
  1160  	if len(fd.d.hash) == 0 {
  1161  		if fd.d.fs.allowRuntimeEnable {
  1162  			return 0, linuxerr.ENODATA
  1163  		}
  1164  		return 0, fd.d.fs.alertIntegrityViolation("Ioctl measureVerity: no hash found")
  1165  	}
  1166  
  1167  	// The first part of VerityDigest is the metadata.
  1168  	if _, err := metadata.CopyIn(t, verityDigest); err != nil {
  1169  		return 0, err
  1170  	}
  1171  	if metadata.DigestSize < uint16(len(fd.d.hash)) {
  1172  		return 0, linuxerr.EOVERFLOW
  1173  	}
  1174  
  1175  	// Populate the output digest size, since DigestSize is both input and
  1176  	// output.
  1177  	metadata.DigestSize = uint16(len(fd.d.hash))
  1178  
  1179  	// First copy the metadata.
  1180  	if _, err := metadata.CopyOut(t, verityDigest); err != nil {
  1181  		return 0, err
  1182  	}
  1183  
  1184  	// Now copy the root hash bytes to the memory after metadata.
  1185  	_, err := t.CopyOutBytes(hostarch.Addr(uintptr(verityDigest)+linux.SizeOfDigestMetadata), fd.d.hash)
  1186  	return 0, err
  1187  }
  1188  
  1189  func (fd *fileDescription) verityFlags(ctx context.Context, flags hostarch.Addr) (uintptr, error) {
  1190  	f := int32(0)
  1191  
  1192  	fd.d.hashMu.RLock()
  1193  	// All enabled files should store a hash. This flag is not settable via
  1194  	// FS_IOC_SETFLAGS.
  1195  	if len(fd.d.hash) != 0 {
  1196  		f |= linux.FS_VERITY_FL
  1197  	}
  1198  	fd.d.hashMu.RUnlock()
  1199  
  1200  	t := kernel.TaskFromContext(ctx)
  1201  	if t == nil {
  1202  		return 0, linuxerr.EINVAL
  1203  	}
  1204  	_, err := primitive.CopyInt32Out(t, flags, f)
  1205  	return 0, err
  1206  }
  1207  
  1208  // Ioctl implements vfs.FileDescriptionImpl.Ioctl.
  1209  func (fd *fileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
  1210  	switch cmd := args[1].Uint(); cmd {
  1211  	case linux.FS_IOC_ENABLE_VERITY:
  1212  		return fd.enableVerity(ctx)
  1213  	case linux.FS_IOC_MEASURE_VERITY:
  1214  		return fd.measureVerity(ctx, args[2].Pointer())
  1215  	case linux.FS_IOC_GETFLAGS:
  1216  		return fd.verityFlags(ctx, args[2].Pointer())
  1217  	default:
  1218  		return 0, syserror.ENOSYS
  1219  	}
  1220  }
  1221  
  1222  // Read implements vfs.FileDescriptionImpl.Read.
  1223  func (fd *fileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
  1224  	// Implement Read with PRead by setting offset.
  1225  	fd.mu.Lock()
  1226  	n, err := fd.PRead(ctx, dst, fd.off, opts)
  1227  	fd.off += n
  1228  	fd.mu.Unlock()
  1229  	return n, err
  1230  }
  1231  
  1232  // PRead implements vfs.FileDescriptionImpl.PRead.
  1233  func (fd *fileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
  1234  	// No need to verify if the file is not enabled yet in
  1235  	// allowRuntimeEnable mode.
  1236  	if !fd.d.verityEnabled() {
  1237  		return fd.lowerFD.PRead(ctx, dst, offset, opts)
  1238  	}
  1239  
  1240  	fd.d.fs.verityMu.RLock()
  1241  	defer fd.d.fs.verityMu.RUnlock()
  1242  	// dataSize is the size of the whole file.
  1243  	dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
  1244  		Name: merkleSizeXattr,
  1245  		Size: sizeOfStringInt32,
  1246  	})
  1247  
  1248  	// The Merkle tree file for the child should have been created and
  1249  	// contains the expected xattrs. If the xattr does not exist, it
  1250  	// indicates unexpected modifications to the file system.
  1251  	if linuxerr.Equals(linuxerr.ENODATA, err) {
  1252  		return 0, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
  1253  	}
  1254  	if err != nil {
  1255  		return 0, err
  1256  	}
  1257  
  1258  	// The dataSize xattr should be an integer. If it's not, it indicates
  1259  	// unexpected modifications to the file system.
  1260  	size, err := strconv.Atoi(dataSize)
  1261  	if err != nil {
  1262  		return 0, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
  1263  	}
  1264  
  1265  	dataReader := FileReadWriteSeeker{
  1266  		FD:  fd.lowerFD,
  1267  		Ctx: ctx,
  1268  	}
  1269  
  1270  	merkleReader := FileReadWriteSeeker{
  1271  		FD:  fd.merkleReader,
  1272  		Ctx: ctx,
  1273  	}
  1274  
  1275  	fd.d.hashMu.RLock()
  1276  	n, err := merkletree.Verify(&merkletree.VerifyParams{
  1277  		Out:                   dst.Writer(ctx),
  1278  		File:                  &dataReader,
  1279  		Tree:                  &merkleReader,
  1280  		Size:                  int64(size),
  1281  		Name:                  fd.d.name,
  1282  		Mode:                  fd.d.mode,
  1283  		UID:                   fd.d.uid,
  1284  		GID:                   fd.d.gid,
  1285  		Children:              fd.d.childrenList,
  1286  		HashAlgorithms:        fd.d.fs.alg.toLinuxHashAlg(),
  1287  		ReadOffset:            offset,
  1288  		ReadSize:              dst.NumBytes(),
  1289  		Expected:              fd.d.hash,
  1290  		DataAndTreeInSameFile: false,
  1291  	})
  1292  	fd.d.hashMu.RUnlock()
  1293  	if err != nil {
  1294  		return 0, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
  1295  	}
  1296  	return n, err
  1297  }
  1298  
  1299  // PWrite implements vfs.FileDescriptionImpl.PWrite.
  1300  func (fd *fileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
  1301  	return 0, linuxerr.EROFS
  1302  }
  1303  
  1304  // Write implements vfs.FileDescriptionImpl.Write.
  1305  func (fd *fileDescription) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
  1306  	return 0, linuxerr.EROFS
  1307  }
  1308  
  1309  // ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
  1310  func (fd *fileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
  1311  	if err := fd.lowerFD.ConfigureMMap(ctx, opts); err != nil {
  1312  		return err
  1313  	}
  1314  	fd.lowerMappable = opts.Mappable
  1315  	if opts.MappingIdentity != nil {
  1316  		opts.MappingIdentity.DecRef(ctx)
  1317  		opts.MappingIdentity = nil
  1318  	}
  1319  
  1320  	// Check if mmap is allowed on the lower filesystem.
  1321  	if !opts.SentryOwnedContent {
  1322  		return linuxerr.ENODEV
  1323  	}
  1324  	return vfs.GenericConfigureMMap(&fd.vfsfd, fd, opts)
  1325  }
  1326  
  1327  // SupportsLocks implements vfs.FileDescriptionImpl.SupportsLocks.
  1328  func (fd *fileDescription) SupportsLocks() bool {
  1329  	return fd.lowerFD.SupportsLocks()
  1330  }
  1331  
  1332  // LockBSD implements vfs.FileDescriptionImpl.LockBSD.
  1333  func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block fslock.Blocker) error {
  1334  	return fd.lowerFD.LockBSD(ctx, ownerPID, t, block)
  1335  }
  1336  
  1337  // UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
  1338  func (fd *fileDescription) UnlockBSD(ctx context.Context, uid fslock.UniqueID) error {
  1339  	return fd.lowerFD.UnlockBSD(ctx)
  1340  }
  1341  
  1342  // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
  1343  func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block fslock.Blocker) error {
  1344  	return fd.lowerFD.LockPOSIX(ctx, uid, ownerPID, t, r, block)
  1345  }
  1346  
  1347  // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
  1348  func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
  1349  	return fd.lowerFD.UnlockPOSIX(ctx, uid, r)
  1350  }
  1351  
  1352  // TestPOSIX implements vfs.FileDescriptionImpl.TestPOSIX.
  1353  func (fd *fileDescription) TestPOSIX(ctx context.Context, uid fslock.UniqueID, t fslock.LockType, r fslock.LockRange) (linux.Flock, error) {
  1354  	return fd.lowerFD.TestPOSIX(ctx, uid, t, r)
  1355  }
  1356  
  1357  // Translate implements memmap.Mappable.Translate.
  1358  func (fd *fileDescription) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
  1359  	ts, err := fd.lowerMappable.Translate(ctx, required, optional, at)
  1360  	if err != nil {
  1361  		return nil, err
  1362  	}
  1363  
  1364  	// dataSize is the size of the whole file.
  1365  	dataSize, err := fd.merkleReader.GetXattr(ctx, &vfs.GetXattrOptions{
  1366  		Name: merkleSizeXattr,
  1367  		Size: sizeOfStringInt32,
  1368  	})
  1369  
  1370  	// The Merkle tree file for the child should have been created and
  1371  	// contains the expected xattrs. If the xattr does not exist, it
  1372  	// indicates unexpected modifications to the file system.
  1373  	if linuxerr.Equals(linuxerr.ENODATA, err) {
  1374  		return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to get xattr %s: %v", merkleSizeXattr, err))
  1375  	}
  1376  	if err != nil {
  1377  		return nil, err
  1378  	}
  1379  
  1380  	// The dataSize xattr should be an integer. If it's not, it indicates
  1381  	// unexpected modifications to the file system.
  1382  	size, err := strconv.Atoi(dataSize)
  1383  	if err != nil {
  1384  		return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Failed to convert xattr %s to int: %v", merkleSizeXattr, err))
  1385  	}
  1386  
  1387  	merkleReader := FileReadWriteSeeker{
  1388  		FD:  fd.merkleReader,
  1389  		Ctx: ctx,
  1390  	}
  1391  
  1392  	for _, t := range ts {
  1393  		// Content integrity relies on sentry owning the backing data. MapInternal is guaranteed
  1394  		// to fetch sentry owned memory because we disallow verity mmaps otherwise.
  1395  		ims, err := t.File.MapInternal(memmap.FileRange{t.Offset, t.Offset + t.Source.Length()}, hostarch.Read)
  1396  		if err != nil {
  1397  			return nil, err
  1398  		}
  1399  		dataReader := mmapReadSeeker{ims, t.Source.Start}
  1400  		var buf bytes.Buffer
  1401  		_, err = merkletree.Verify(&merkletree.VerifyParams{
  1402  			Out:                   &buf,
  1403  			File:                  &dataReader,
  1404  			Tree:                  &merkleReader,
  1405  			Size:                  int64(size),
  1406  			Name:                  fd.d.name,
  1407  			Mode:                  fd.d.mode,
  1408  			UID:                   fd.d.uid,
  1409  			GID:                   fd.d.gid,
  1410  			HashAlgorithms:        fd.d.fs.alg.toLinuxHashAlg(),
  1411  			ReadOffset:            int64(t.Source.Start),
  1412  			ReadSize:              int64(t.Source.Length()),
  1413  			Expected:              fd.d.hash,
  1414  			DataAndTreeInSameFile: false,
  1415  		})
  1416  		if err != nil {
  1417  			return nil, fd.d.fs.alertIntegrityViolation(fmt.Sprintf("Verification failed: %v", err))
  1418  		}
  1419  	}
  1420  	return ts, err
  1421  }
  1422  
  1423  // AddMapping implements memmap.Mappable.AddMapping.
  1424  func (fd *fileDescription) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
  1425  	return fd.lowerMappable.AddMapping(ctx, ms, ar, offset, writable)
  1426  }
  1427  
  1428  // RemoveMapping implements memmap.Mappable.RemoveMapping.
  1429  func (fd *fileDescription) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
  1430  	fd.lowerMappable.RemoveMapping(ctx, ms, ar, offset, writable)
  1431  }
  1432  
  1433  // CopyMapping implements memmap.Mappable.CopyMapping.
  1434  func (fd *fileDescription) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
  1435  	return fd.lowerMappable.CopyMapping(ctx, ms, srcAR, dstAR, offset, writable)
  1436  }
  1437  
  1438  // InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
  1439  func (fd *fileDescription) InvalidateUnsavable(context.Context) error {
  1440  	return nil
  1441  }
  1442  
// mmapReadSeeker is a helper struct used by fileDescription.Translate to pass
// a safemem.BlockSeq pointing to the mapped region as io.ReaderAt.
type mmapReadSeeker struct {
	// BlockSeq is the internally mapped memory that reads are served from.
	safemem.BlockSeq
	// Offset is the offset into the mapped file at which BlockSeq starts.
	// ReadAt subtracts it from the requested file offset.
	Offset uint64
}
  1449  
  1450  // ReadAt implements io.ReaderAt.ReadAt. off is the offset into the mapped file.
  1451  func (r *mmapReadSeeker) ReadAt(p []byte, off int64) (int, error) {
  1452  	bs := r.BlockSeq
  1453  	// Adjust the offset into the mapped file to get the offset into the internally
  1454  	// mapped region.
  1455  	readOffset := off - int64(r.Offset)
  1456  	if readOffset < 0 {
  1457  		return 0, linuxerr.EINVAL
  1458  	}
  1459  	bs.DropFirst64(uint64(readOffset))
  1460  	view := bs.TakeFirst64(uint64(len(p)))
  1461  	dst := safemem.BlockSeqOf(safemem.BlockFromSafeSlice(p))
  1462  	n, err := safemem.CopySeq(dst, view)
  1463  	return int(n), err
  1464  }
  1465  
// FileReadWriteSeeker is a helper struct to pass a vfs.FileDescription as
// io.Reader/io.Writer/io.ReadSeeker/io.ReaderAt/io.WriterAt/etc.
type FileReadWriteSeeker struct {
	// FD is the file description that all operations are forwarded to.
	FD *vfs.FileDescription
	// Ctx is the context passed to every call on FD.
	Ctx context.Context
	// ROpts are the options passed to FD.Read and FD.PRead.
	ROpts vfs.ReadOptions
	// WOpts are the options passed to FD.Write and FD.PWrite.
	WOpts vfs.WriteOptions
}
  1474  
  1475  // ReadAt implements io.ReaderAt.ReadAt.
  1476  func (f *FileReadWriteSeeker) ReadAt(p []byte, off int64) (int, error) {
  1477  	dst := usermem.BytesIOSequence(p)
  1478  	n, err := f.FD.PRead(f.Ctx, dst, off, f.ROpts)
  1479  	return int(n), err
  1480  }
  1481  
  1482  // Read implements io.ReadWriteSeeker.Read.
  1483  func (f *FileReadWriteSeeker) Read(p []byte) (int, error) {
  1484  	dst := usermem.BytesIOSequence(p)
  1485  	n, err := f.FD.Read(f.Ctx, dst, f.ROpts)
  1486  	return int(n), err
  1487  }
  1488  
  1489  // Seek implements io.ReadWriteSeeker.Seek.
  1490  func (f *FileReadWriteSeeker) Seek(offset int64, whence int) (int64, error) {
  1491  	return f.FD.Seek(f.Ctx, offset, int32(whence))
  1492  }
  1493  
  1494  // WriteAt implements io.WriterAt.WriteAt.
  1495  func (f *FileReadWriteSeeker) WriteAt(p []byte, off int64) (int, error) {
  1496  	dst := usermem.BytesIOSequence(p)
  1497  	n, err := f.FD.PWrite(f.Ctx, dst, off, f.WOpts)
  1498  	return int(n), err
  1499  }
  1500  
  1501  // Write implements io.ReadWriteSeeker.Write.
  1502  func (f *FileReadWriteSeeker) Write(p []byte) (int, error) {
  1503  	buf := usermem.BytesIOSequence(p)
  1504  	n, err := f.FD.Write(f.Ctx, buf, f.WOpts)
  1505  	return int(n), err
  1506  }