gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/vfs/mount.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs 16 17 import ( 18 "bytes" 19 "fmt" 20 "math" 21 "sort" 22 "strings" 23 24 "gvisor.dev/gvisor/pkg/abi/linux" 25 "gvisor.dev/gvisor/pkg/atomicbitops" 26 "gvisor.dev/gvisor/pkg/cleanup" 27 "gvisor.dev/gvisor/pkg/context" 28 "gvisor.dev/gvisor/pkg/errors/linuxerr" 29 "gvisor.dev/gvisor/pkg/refs" 30 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 31 ) 32 33 // MountMax is the maximum number of mounts allowed. In Linux this can be 34 // configured by the user at /proc/sys/fs/mount-max, but the default is 35 // 100,000. We set the gVisor limit to 10,000. 36 const ( 37 MountMax = 10000 38 nsfsName = "nsfs" 39 cgroupFsName = "cgroup" 40 ) 41 42 // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem 43 // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem 44 // (Mount.fs), which applies to path resolution in the context of a particular 45 // Mount (Mount.key.parent). 46 // 47 // Mounts are reference-counted. Unless otherwise specified, all Mount methods 48 // require that a reference is held. 49 // 50 // Mount and Filesystem are distinct types because it's possible for a single 51 // Filesystem to be mounted at multiple locations and/or in multiple mount 52 // namespaces. 
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID.
	ID uint64

	// flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". flags is protected by
	// VirtualFilesystem.mountMu.
	flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	refs atomicbitops.Int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu.
	children map[*Mount]struct{}

	// isShared indicates this mount has the MS_SHARED propagation type.
	isShared bool

	// sharedEntry is an entry in a circular list (ring) of mounts in a shared
	// peer group.
	sharedEntry mountEntry

	// followerList is the list of mounts which have this mount as their
	// leader.
	followerList followerList

	// followerEntry is an entry in a followerList.
	followerEntry

	// leader is the mount that this mount receives propagation events from.
	leader *Mount

	// groupID is the ID for this mount's shared peer group. If the mount is not
	// in a peer group, this is 0.
	groupID uint32

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// locked is true if the mount cannot be unmounted in the current mount
	// namespace. It is analogous to MNT_LOCKED in Linux.
	locked bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers atomicbitops.Int64
}

// newMount returns a new Mount for (fs, root) in namespace mntns (which may be
// nil), configured from opts. The Mount gets the next ID from
// vfs.lastMountID, starts with a reference count of 1, is never shared, and is
// registered with the refs package.
//
// newMount does not itself take references on fs or root; callers that need
// them (see NewDisconnectedMount) take them before calling.
func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
	mnt := &Mount{
		ID:       vfs.lastMountID.Add(1),
		flags:    opts.Flags,
		vfs:      vfs,
		fs:       fs,
		root:     root,
		ns:       mntns,
		locked:   opts.Locked,
		isShared: false,
		refs:     atomicbitops.FromInt64(1),
	}
	if opts.ReadOnly {
		// NOTE(review): the error returned by setReadOnlyLocked is discarded
		// here; presumably it cannot fail on a brand-new mount with no
		// writers -- confirm.
		mnt.setReadOnlyLocked(true)
	}
	// Start mnt as the only member of its own peer ring.
	mnt.sharedEntry.Init(mnt)
	refs.Register(mnt)
	return mnt
}

// Options returns a copy of the MountOptions currently applicable to mnt.
// Only the Flags and ReadOnly fields are populated. The VFS mount lock is
// held while the options are read.
func (mnt *Mount) Options() MountOptions {
	mnt.vfs.lockMounts()
	defer mnt.vfs.unlockMounts(context.Background())
	return MountOptions{
		Flags:    mnt.flags,
		ReadOnly: mnt.ReadOnlyLocked(),
	}
}

// setMountOptions sets mnt's options to the given opts.
//
// Preconditions:
//   - vfs.mountMu must be locked.
166 func (mnt *Mount) setMountOptions(opts *MountOptions) error { 167 if opts == nil { 168 return linuxerr.EINVAL 169 } 170 if err := mnt.setReadOnlyLocked(opts.ReadOnly); err != nil { 171 return err 172 } 173 mnt.flags = opts.Flags 174 return nil 175 } 176 177 // MountFlags returns a bit mask that indicates mount options. 178 func (mnt *Mount) MountFlags() uint64 { 179 mnt.vfs.lockMounts() 180 defer mnt.vfs.unlockMounts(context.Background()) 181 var flags uint64 182 if mnt.flags.NoExec { 183 flags |= linux.ST_NOEXEC 184 } 185 if mnt.flags.NoATime { 186 flags |= linux.ST_NOATIME 187 } 188 if mnt.flags.NoDev { 189 flags |= linux.ST_NODEV 190 } 191 if mnt.flags.NoSUID { 192 flags |= linux.ST_NOSUID 193 } 194 if mnt.ReadOnlyLocked() { 195 flags |= linux.ST_RDONLY 196 } 197 return flags 198 } 199 200 func (mnt *Mount) isFollower() bool { 201 return mnt.leader != nil 202 } 203 204 func (mnt *Mount) neverConnected() bool { 205 return mnt.ns == nil 206 } 207 208 // coveringMount returns a mount that completely covers mnt if it exists and nil 209 // otherwise. A mount that covers another is one that is the only child of its 210 // parent and whose mountpoint is its parent's root. 211 func (mnt *Mount) coveringMount() *Mount { 212 if len(mnt.children) != 1 { 213 return nil 214 } 215 // Get the child from the children map. 216 var child *Mount 217 for child = range mnt.children { 218 break 219 } 220 if child.point() != mnt.root { 221 return nil 222 } 223 return child 224 } 225 226 // validInMountNS checks if the mount is valid in the current mount namespace. This includes 227 // checking if has previously been unmounted. It is analogous to fs/namespace.c:check_mnt() in 228 // Linux. 
229 // 230 // +checklocks:vfs.mountMu 231 func (vfs *VirtualFilesystem) validInMountNS(ctx context.Context, mnt *Mount) bool { 232 if mntns := MountNamespaceFromContext(ctx); mntns != nil { 233 vfs.delayDecRef(mntns) 234 return mnt.ns == mntns && !mnt.umounted 235 } 236 return false 237 } 238 239 // NewFilesystem creates a new filesystem object not yet associated with any 240 // mounts. It can be installed into the filesystem tree with ConnectMountAt. 241 // Note that only the filesystem-specific mount options from opts are used by 242 // this function, mount flags are ignored. To set mount flags, pass them to a 243 // corresponding ConnectMountAt. 244 func (vfs *VirtualFilesystem) NewFilesystem(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*Filesystem, *Dentry, error) { 245 rft := vfs.getFilesystemType(fsTypeName) 246 if rft == nil { 247 return nil, nil, linuxerr.ENODEV 248 } 249 if !opts.GetFilesystemOptions.InternalMount && !rft.opts.AllowUserMount { 250 return nil, nil, linuxerr.ENODEV 251 } 252 return rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) 253 } 254 255 // NewDisconnectedMount returns a Mount representing fs with the given root 256 // (which may be nil). The new Mount is not associated with any MountNamespace 257 // and is not connected to any other Mounts. References are taken on fs and 258 // root. 259 func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) *Mount { 260 fs.IncRef() 261 if root != nil { 262 root.IncRef() 263 } 264 return newMount(vfs, fs, root, nil /* mntns */, opts) 265 } 266 267 // MountDisconnected creates a Filesystem configured by the given arguments, 268 // then returns a Mount representing it. The new Mount is not associated with 269 // any MountNamespace and is not connected to any other Mounts. 
func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
	fs, root, err := vfs.NewFilesystem(ctx, creds, source, fsTypeName, opts)
	if err != nil {
		return nil, err
	}
	// Unlike NewDisconnectedMount, no extra references are taken on fs and
	// root here: the ones returned by NewFilesystem are handed to the Mount.
	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
}

// attachTreeLocked attaches the mount tree at mnt to mp and propagates the mount to mp.mount's
// peers and followers. This method consumes the reference on mp. It is analogous to
// fs/namespace.c:attach_recursive_mnt() in Linux. The mount point mp must have its dentry locked
// before calling attachTreeLocked.
//
// On every return path mp.dentry.mu has been unlocked. EINVAL is returned if
// mp.mount was never connected to a namespace; errors from the mount-count
// check, group ID allocation and propagation are returned unchanged.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) attachTreeLocked(ctx context.Context, mnt *Mount, mp VirtualDentry) error {
	// Failure path: give back any group IDs allocated for mnt's tree, unlock
	// the mount point dentry and drop the reference on mp. Disarmed by
	// cleanup.Release() once nothing below can fail.
	cleanup := cleanup.Make(func() {
		vfs.cleanupGroupIDs(mnt.submountsLocked()) // +checklocksforce
		mp.dentry.mu.Unlock()
		vfs.delayDecRef(mp)
	})
	defer cleanup.Clean()
	// This is equivalent to checking for SB_NOUSER in Linux, which is set on all
	// anon mounts and sentry-internal filesystems like pipefs.
	if mp.mount.neverConnected() {
		return linuxerr.EINVAL
	}
	// Whatever happens below, leave the namespace's pending counter at zero
	// on exit.
	defer func() { mp.mount.ns.pending = 0 }()
	if err := mp.mount.ns.checkMountCount(ctx, mnt); err != nil {
		return err
	}

	var (
		propMnts map[*Mount]struct{}
		err      error
	)
	if mp.mount.isShared {
		// Note: this err is scoped to the if statement and shadows the one
		// declared above.
		if err := vfs.allocMountGroupIDs(mnt, true); err != nil {
			return err
		}
		propMnts, err = vfs.doPropagation(ctx, mnt, mp)
		if err != nil {
			// Undo the propagated mounts that were created before the
			// failure.
			for pmnt := range propMnts {
				if !pmnt.parent().neverConnected() {
					pmnt.parent().ns.pending -= pmnt.countSubmountsLocked()
				}
				vfs.abortUncommitedMount(ctx, pmnt)
			}
			return err
		}
	}
	// Nothing below returns an error; from here on mp's lock is released
	// explicitly and its reference is consumed by connectLocked.
	cleanup.Release()

	if mp.mount.isShared {
		for _, m := range mnt.submountsLocked() {
			m.isShared = true
		}
	}
	vfs.mounts.seq.BeginWrite()
	vfs.connectLocked(mnt, mp, mp.mount.ns)
	vfs.mounts.seq.EndWrite()
	mp.dentry.mu.Unlock()
	vfs.commitChildren(ctx, mnt)

	// owner is the user namespace owning the caller's mount namespace, or nil
	// if ctx carries no mount namespace.
	var owner *auth.UserNamespace
	if mntns := MountNamespaceFromContext(ctx); mntns != nil {
		owner = mntns.Owner
		mntns.DecRef(ctx)
	}
	for pmnt := range propMnts {
		vfs.commitMount(ctx, pmnt)
		// Propagated trees that land in a namespace owned by a different
		// user namespace are locked as a whole; the root of each propagated
		// tree is then unlocked again.
		if pmnt.parent().ns.Owner != owner {
			vfs.lockMountTree(pmnt)
		}
		pmnt.locked = false
	}
	return nil
}

// lockMountTree marks every mount returned by mnt.submountsLocked() as
// locked.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) lockMountTree(mnt *Mount) {
	for _, m := range mnt.submountsLocked() {
		// TODO(b/315839347): Add equivalents for MNT_LOCK_ATIME,
		// MNT_LOCK_READONLY, etc.
		m.locked = true
	}
}

// mountHasLockedChildren reports whether any direct child of mnt is locked
// and has a mount point that the filesystem's IsDescendant reports as a
// descendant of vd.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) mountHasLockedChildren(mnt *Mount, vd VirtualDentry) bool {
	for child := range mnt.children {
		mp := child.getKey()
		// Children mounted outside of vd's subtree are irrelevant.
		if !mp.mount.fs.Impl().IsDescendant(vd, mp) {
			continue
		}
		if child.locked {
			return true
		}
	}
	return false
}

// ConnectMountAt connects mnt at the path represented by target.
//
// Preconditions: mnt must be disconnected.
func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
	// lock ordering.
	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return err
	}
	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)
	// lockMountpoint consumes the reference on vd, including on failure.
	mp, err := vfs.lockMountpoint(vd)
	if err != nil {
		return err
	}
	if mp.mount.neverConnected() || mp.mount.umounted {
		// Refuse to mount on top of a mount that is not part of a live
		// namespace; release what lockMountpoint handed us.
		mp.dentry.mu.Unlock()
		vfs.delayDecRef(mp)
		return linuxerr.EINVAL
	}
	// attachTreeLocked takes over both the dentry lock and the reference on
	// mp.
	return vfs.attachTreeLocked(ctx, mnt, mp)
}

// lockMountpoint returns VirtualDentry with a locked Dentry. If vd is a
// mountpoint, the method returns a VirtualDentry with a locked Dentry that is
// the topmost mount stacked on that Dentry. This method consumes a reference
// on vd and returns a VirtualDentry with an extra reference. It is analogous to
// fs/namespace.c:do_lock_mount() in Linux.
//
// If the mount has been umounted or the dentry is dead, lockMountpoint
// returns ENOENT with no dentry locked.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) lockMountpoint(vd VirtualDentry) (VirtualDentry, error) {
	vd.dentry.mu.Lock()
	// Each iteration holds vd.dentry.mu and either stops at vd or moves one
	// level up the stack of mounts mounted on vd.
	for {
		if vd.mount.umounted || vd.dentry.dead {
			vd.dentry.mu.Unlock()
			vfs.delayDecRef(vd)
			return VirtualDentry{}, linuxerr.ENOENT
		}
		// vd might have been mounted over between vfs.GetDentryAt() and
		// vfs.mountMu.Lock().
		if !vd.dentry.isMounted() {
			break
		}
		nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
		if nextmnt == nil {
			break
		}
		// It's possible that nextmnt has been umounted but not disconnected,
		// in which case vfs no longer holds a reference on it, and the last
		// reference may be concurrently dropped even though we're holding
		// vfs.mountMu.
		if !nextmnt.tryIncMountedRef() {
			break
		}
		// This can't fail since we're holding vfs.mountMu.
		nextmnt.root.IncRef()
		// Hand over: release the lock and reference on the covered dentry,
		// then lock the root of the mount that covers it.
		vd.dentry.mu.Unlock()
		vfs.delayDecRef(vd)
		vd = VirtualDentry{
			mount:  nextmnt,
			dentry: nextmnt.root,
		}
		vd.dentry.mu.Lock()
	}
	return vd, nil
}

// CloneMountAt returns a new mount with the same fs, specified root and
// mount options. If mount options are nil, mnt's options are copied. The clone
// is added to mnt's peer group if mnt is shared. If not the clone is in a
// shared peer group by itself.
//
// The VFS mount lock is held for the duration of the call.
func (vfs *VirtualFilesystem) CloneMountAt(mnt *Mount, root *Dentry, mopts *MountOptions) (*Mount, error) {
	vfs.lockMounts()
	defer vfs.unlockMounts(context.Background())
	return vfs.cloneMount(mnt, root, mopts, makeSharedClone)
}

// cloneMount returns a new mount with mnt.fs as the filesystem and root as the
// root, with a propagation type specified by cloneType. The returned mount has
// an extra reference. If mopts is nil, use the options found in mnt.
// This method is analogous to fs/namespace.c:clone_mnt() in Linux.
453 // 454 // +checklocks:vfs.mountMu 455 func (vfs *VirtualFilesystem) cloneMount(mnt *Mount, root *Dentry, mopts *MountOptions, cloneType int) (*Mount, error) { 456 opts := mopts 457 if opts == nil { 458 opts = &MountOptions{ 459 Flags: mnt.flags, 460 ReadOnly: mnt.ReadOnlyLocked(), 461 } 462 } 463 clone := vfs.NewDisconnectedMount(mnt.fs, root, opts) 464 if cloneType&(makeFollowerClone|makePrivateClone|sharedToFollowerClone) != 0 { 465 clone.groupID = 0 466 } else { 467 clone.groupID = mnt.groupID 468 } 469 if cloneType&makeSharedClone != 0 && clone.groupID == 0 { 470 if err := vfs.allocateGroupID(clone); err != nil { 471 vfs.delayDecRef(clone) 472 return nil, err 473 } 474 } 475 clone.isShared = mnt.isShared 476 clone.locked = mnt.locked 477 if cloneType&makeFollowerClone != 0 || (cloneType&sharedToFollowerClone != 0 && mnt.isShared) { 478 mnt.followerList.PushFront(clone) 479 clone.leader = mnt 480 clone.isShared = false 481 } else if cloneType&makePrivateClone == 0 { 482 if cloneType&makeSharedClone != 0 || mnt.isShared { 483 mnt.sharedEntry.Add(&clone.sharedEntry) 484 } 485 if mnt.isFollower() { 486 mnt.leader.followerList.InsertAfter(mnt, clone) 487 } 488 clone.leader = mnt.leader 489 } else { 490 clone.isShared = false 491 } 492 if cloneType&makeSharedClone != 0 { 493 clone.isShared = true 494 } 495 return clone, nil 496 } 497 498 type cloneTreeNode struct { 499 prevMount *Mount 500 parentMount *Mount 501 } 502 503 // cloneMountTree creates a copy of mnt's tree with the specified root 504 // dentry at root. The new descendants are added to mnt's children list but are 505 // not connected with call to connectLocked. 506 // `cloneFunc` is a callback that is executed for each cloned mount. 507 // This method is analogous to fs/namespace.c:copy_tree() in Linux. 
508 // 509 // +checklocks:vfs.mountMu 510 func (vfs *VirtualFilesystem) cloneMountTree(ctx context.Context, mnt *Mount, root *Dentry, cloneType int, cloneFunc func(ctx context.Context, oldmnt, newMnt *Mount)) (*Mount, error) { 511 clone, err := vfs.cloneMount(mnt, root, nil, cloneType) 512 if err != nil { 513 return nil, err 514 } 515 if cloneFunc != nil { 516 cloneFunc(ctx, mnt, clone) 517 } 518 queue := []cloneTreeNode{{mnt, clone}} 519 for len(queue) != 0 { 520 p := queue[len(queue)-1] 521 queue = queue[:len(queue)-1] 522 for c := range p.prevMount.children { 523 if mp := c.getKey(); p.prevMount == mnt && !mp.mount.fs.Impl().IsDescendant(VirtualDentry{mnt, root}, mp) { 524 continue 525 } 526 m, err := vfs.cloneMount(c, c.root, nil, cloneType) 527 if err != nil { 528 vfs.abortUncommitedMount(ctx, clone) 529 return nil, err 530 } 531 mp := VirtualDentry{ 532 mount: p.parentMount, 533 dentry: c.point(), 534 } 535 mp.IncRef() 536 m.setKey(mp) 537 if p.parentMount.children == nil { 538 p.parentMount.children = make(map[*Mount]struct{}) 539 } 540 p.parentMount.children[m] = struct{}{} 541 if len(c.children) != 0 { 542 queue = append(queue, cloneTreeNode{c, m}) 543 } 544 if cloneFunc != nil { 545 cloneFunc(ctx, c, m) 546 } 547 } 548 } 549 return clone, nil 550 } 551 552 // BindAt creates a clone of the source path's parent mount and mounts it at 553 // the target path. The new mount's root dentry is one pointed to by the source 554 // path. 
func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credentials, source, target *PathOperation, recursive bool) error {
	// Both paths are resolved before taking the mount lock.
	sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{})
	if err != nil {
		return err
	}
	defer sourceVd.DecRef(ctx)
	targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return err
	}

	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)
	// lockMountpoint consumes the reference on targetVd, including on
	// failure.
	mp, err := vfs.lockMountpoint(targetVd)
	if err != nil {
		return err
	}
	// Until attachTreeLocked takes over, early returns must unlock the mount
	// point dentry and drop the reference on mp.
	cleanup := cleanup.Make(func() {
		mp.dentry.mu.Unlock()
		vfs.delayDecRef(mp) // +checklocksforce
	})
	defer cleanup.Clean()
	// Sources on nsfs or cgroup filesystems are exempt from the namespace
	// validity check, so namespace mounts can be bound to other mount points.
	fsName := sourceVd.mount.Filesystem().FilesystemType().Name()
	if !vfs.validInMountNS(ctx, sourceVd.mount) && fsName != nsfsName && fsName != cgroupFsName {
		return linuxerr.EINVAL
	}
	if !vfs.validInMountNS(ctx, mp.mount) {
		return linuxerr.EINVAL
	}

	var clone *Mount
	if recursive {
		clone, err = vfs.cloneMountTree(ctx, sourceVd.mount, sourceVd.dentry, 0, nil)
	} else {
		// A non-recursive bind would expose whatever locked children are
		// hiding, so it is refused.
		if vfs.mountHasLockedChildren(sourceVd.mount, sourceVd) {
			return linuxerr.EINVAL
		}
		clone, err = vfs.cloneMount(sourceVd.mount, sourceVd.dentry, nil, 0)
	}
	if err != nil {
		return err
	}
	// attachTreeLocked is responsible for mp's lock and reference from here.
	cleanup.Release()

	// Queue the release of the reference the clone was created with.
	vfs.delayDecRef(clone)
	clone.locked = false
	if err := vfs.attachTreeLocked(ctx, clone, mp); err != nil {
		vfs.abortUncomittedChildren(ctx, clone)
		return err
	}
	return nil
}

// RemountAt changes the mountflags and data of an existing mount without
// having to unmount and remount the filesystem.
610 func (vfs *VirtualFilesystem) RemountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MountOptions) error { 611 vd, err := vfs.getMountpoint(ctx, creds, pop) 612 if err != nil { 613 return err 614 } 615 defer vd.DecRef(ctx) 616 vfs.lockMounts() 617 defer vfs.unlockMounts(ctx) 618 mnt := vd.Mount() 619 if !vfs.validInMountNS(ctx, mnt) { 620 return linuxerr.EINVAL 621 } 622 return mnt.setMountOptions(opts) 623 } 624 625 // MountAt creates and mounts a Filesystem configured by the given arguments. 626 // The VirtualFilesystem will hold a reference to the Mount until it is 627 // unmounted. 628 // 629 // This method returns the mounted Mount without a reference, for convenience 630 // during VFS setup when there is no chance of racing with unmount. 631 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { 632 mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) 633 if err != nil { 634 return nil, err 635 } 636 defer mnt.DecRef(ctx) 637 if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { 638 return nil, err 639 } 640 return mnt, nil 641 } 642 643 // UmountAt removes the Mount at the given path. 644 func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { 645 if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { 646 return linuxerr.EINVAL 647 } 648 649 // MNT_FORCE is currently unimplemented except for the permission check. 650 // Force unmounting specifically requires CAP_SYS_ADMIN in the root user 651 // namespace, and not in the owner user namespace for the target mount. See 652 // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) 
653 if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { 654 return linuxerr.EPERM 655 } 656 vd, err := vfs.getMountpoint(ctx, creds, pop) 657 if err != nil { 658 return err 659 } 660 defer vd.DecRef(ctx) 661 662 vfs.lockMounts() 663 defer vfs.unlockMounts(ctx) 664 if vd.mount.locked { 665 return linuxerr.EINVAL 666 } 667 if !vfs.validInMountNS(ctx, vd.mount) { 668 return linuxerr.EINVAL 669 } 670 if vd.mount == vd.mount.ns.root { 671 return linuxerr.EINVAL 672 } 673 674 if opts.Flags&linux.MNT_DETACH == 0 && vfs.arePropMountsBusy(vd.mount) { 675 return linuxerr.EBUSY 676 } 677 678 // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's 679 // root, which we don't implement yet (we'll just fail it since the caller 680 // holds a reference on it). 681 vfs.umountTreeLocked(vd.mount, &umountRecursiveOptions{ 682 eager: opts.Flags&linux.MNT_DETACH == 0, 683 disconnectHierarchy: true, 684 propagate: true, 685 }) 686 return nil 687 } 688 689 // mountHasExpectedRefs checks that mnt has the correct number of references 690 // before a umount. It is analogous to fs/pnode.c:do_refcount_check(). 691 // 692 // +checklocks:vfs.mountMu 693 func (vfs *VirtualFilesystem) mountHasExpectedRefs(mnt *Mount) bool { 694 expectedRefs := int64(1) 695 if !mnt.umounted { 696 expectedRefs++ 697 } 698 if mnt.coveringMount() != nil { 699 expectedRefs++ 700 } 701 return mnt.refs.Load()&^math.MinInt64 == expectedRefs // mask out MSB 702 } 703 704 // +stateify savable 705 type umountRecursiveOptions struct { 706 // If eager is true, ensure that future calls to Mount.tryIncMountedRef() 707 // on umounted mounts fail. 708 // 709 // eager is analogous to Linux's UMOUNT_SYNC. 710 eager bool 711 712 // If disconnectHierarchy is true, Mounts that are umounted hierarchically 713 // should be disconnected from their parents. 
(Mounts whose parents are not 714 // umounted, which in most cases means the Mount passed to the initial call 715 // to umountRecursiveLocked, are unconditionally disconnected for 716 // consistency with Linux.) 717 // 718 // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. 719 disconnectHierarchy bool 720 721 // If propagate is true, mounts located at the same point on the mount's 722 // parent's peers and follows will also be umounted if they do not have any 723 // children. 724 // 725 // propagate is analogous to Linux's UMOUNT_PROPAGATE. 726 propagate bool 727 } 728 729 // shouldUmount returns if this mount should be disconnected from its parent. 730 // It is analogous to fs/namespace.c:disconnect_mount() in Linux. 731 // 732 // +checklocks:vfs.mountMu 733 func (vfs *VirtualFilesystem) shouldUmount(mnt *Mount, opts *umountRecursiveOptions) bool { 734 // Always disconnect when it's not a lazy unmount. 735 if opts.eager { 736 return true 737 } 738 // If a mount does not have a parent, it won't be disconnected but will be 739 // DecRef-ed. 740 if mnt.parent() == nil { 741 return true 742 } 743 // Always unmount if the parent is not marked as unmounted. 744 if !mnt.parent().umounted { 745 return true 746 } 747 // If the parent is marked as unmounted, we can only unmount is 748 // UMOUNT_CONNECTED is false. 749 if !opts.disconnectHierarchy { 750 return false 751 } 752 if mnt.locked { 753 return false 754 } 755 return true 756 } 757 758 // umountTreeLocked marks mnt and its descendants as umounted. 759 // 760 // umountTreeLocked is analogous to Linux's fs/namespace.c:umount_tree(). 
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) umountTreeLocked(mnt *Mount, opts *umountRecursiveOptions) {
	if opts.propagate {
		vfs.unlockPropagationMounts(mnt)
	}
	// Phase 1: mark the whole tree (and, if requested, the mounts that the
	// umount propagates to) as umounted.
	umountMnts := mnt.submountsLocked()
	for _, mnt := range umountMnts {
		vfs.umount(mnt)
	}
	if opts.propagate {
		umountMnts = append(umountMnts, vfs.propagateUmount(umountMnts)...)
	}

	// Phase 2: detach the marked mounts inside a single seqcount writer
	// critical section.
	vfs.mounts.seq.BeginWrite()
	for _, mnt := range umountMnts {
		if opts.eager {
			// Set the MSB of refs so that tryIncMountedRef fails from now on.
			// A negative value means the bit is already set.
			for {
				refs := mnt.refs.Load()
				if refs < 0 {
					break
				}
				if mnt.refs.CompareAndSwap(refs, refs|math.MinInt64) {
					break
				}
			}
		}
		if mnt.parent() != nil {
			vfs.delayDecRef(mnt.getKey())
			if vfs.shouldUmount(mnt, opts) {
				vfs.disconnectLocked(mnt)
			} else {
				// Restore mnt in its parent's children list with a reference,
				// but leave it marked as unmounted. These partly unmounted
				// mounts are cleaned up in vfs.forgetDeadMountpoints and
				// Mount.destroy. We keep the extra reference on the mount but
				// remove a reference on the mount point so that Mount.destroy
				// is called when there are no other references on the parent.
				mnt.IncRef()
				mnt.parent().children[mnt] = struct{}{}
			}
		}
		vfs.setPropagation(mnt, linux.MS_PRIVATE)
	}
	vfs.mounts.seq.EndWrite()
}

// umount marks mnt as umounted, queueing the release of the reference that
// connectLocked took on behalf of the VFS the first time it is called for
// mnt, and removes mnt from its parent's children set (if it has a parent).
// It does not clear mnt's key.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) umount(mnt *Mount) {
	if !mnt.umounted {
		mnt.umounted = true
		vfs.delayDecRef(mnt)
	}
	if parent := mnt.parent(); parent != nil {
		delete(parent.children, mnt)
	}
}

// changeMountpoint disconnects mnt from its current mount point and connects
// it to mp. It must be called from a vfs.mounts.seq writer critical section.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) changeMountpoint(mnt *Mount, mp VirtualDentry) {
	mp.dentry.mu.Lock()
	// disconnectLocked returns the old mount point with a reference held;
	// queue its release.
	vfs.delayDecRef(vfs.disconnectLocked(mnt))
	// connectLocked below takes a new reference on mnt, so queue the release
	// of one to keep the count balanced.
	vfs.delayDecRef(mnt)
	// connectLocked consumes a reference on mp; take one on its behalf.
	mp.IncRef()
	vfs.connectLocked(mnt, mp, mp.mount.ns)
	mp.dentry.mu.Unlock()
}

// connectLocked makes vd the mount parent/point for mnt. It consumes
// references held by vd.
//
// Preconditions:
//   - vfs.mountMu must be locked.
//   - vfs.mounts.seq must be in a writer critical section.
//   - vd.dentry.mu must be locked.
//   - mnt.parent() == nil or mnt.parent().children doesn't contain mnt.
//     i.e. mnt must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
	if checkInvariants {
		if mnt.parent() != nil && mnt.parent().children != nil {
			if _, ok := mnt.parent().children[mnt]; ok {
				panic("VFS.connectLocked called on connected mount")
			}
		}
	}
	mnt.IncRef() // dropped by vfs.umount().
	mnt.setKey(vd)
	// children is allocated lazily.
	if vd.mount.children == nil {
		vd.mount.children = make(map[*Mount]struct{})
	}
	vd.mount.children[mnt] = struct{}{}
	// Bookkeeping: per-dentry mount count, per-namespace counters, the mount
	// table, and the VFS-wide mountpoint index.
	vd.dentry.mounts.Add(1)
	mnt.ns = mntns
	mntns.mountpoints[vd.dentry]++
	mntns.mounts++
	vfs.mounts.insertSeqed(mnt)
	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
	if !ok {
		vfsmpmounts = make(map[*Mount]struct{})
		vfs.mountpoints[vd.dentry] = vfsmpmounts
	}
	vfsmpmounts[mnt] = struct{}{}
	vfs.maybeResolveMountPromise(vd)
}

// disconnectLocked makes mnt have no mount parent/point and returns its old
// mount parent/point with a reference held.
//
// Preconditions:
//   - vfs.mountMu must be locked.
//   - vfs.mounts.seq must be in a writer critical section.
//   - mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
	vd := mnt.getKey()
	if checkInvariants {
		if vd.mount == nil {
			panic("VFS.disconnectLocked called on disconnected mount")
		}
		if mnt.ns.mountpoints[vd.dentry] == 0 {
			panic("VFS.disconnectLocked called on dentry with zero mountpoints.")
		}
		if mnt.ns.mounts == 0 {
			panic("VFS.disconnectLocked called on namespace with zero mounts.")
		}
	}
	// Undo the bookkeeping done by connectLocked.
	delete(vd.mount.children, mnt)
	// Adding MaxUint32 decrements the counter by one via wrap-around
	// (presumably mounts is an unsigned 32-bit atomic -- confirm).
	vd.dentry.mounts.Add(math.MaxUint32) // -1
	mnt.ns.mountpoints[vd.dentry]--
	mnt.ns.mounts--
	// Drop the map entry once the last mount on this dentry is gone.
	if mnt.ns.mountpoints[vd.dentry] == 0 {
		delete(mnt.ns.mountpoints, vd.dentry)
	}
	vfs.mounts.removeSeqed(mnt)
	mnt.setKey(VirtualDentry{}) // Clear mnt.key.
	vfsmpmounts := vfs.mountpoints[vd.dentry]
	delete(vfsmpmounts, mnt)
	if len(vfsmpmounts) == 0 {
		delete(vfs.mountpoints, vd.dentry)
	}
	// The references that mnt.key held on vd are handed to the caller.
	return vd
}

// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
// reference count is already zero, or mnt has been eagerly umounted,
// tryIncMountedRef does nothing and returns false.
//
// tryIncMountedRef does not require that a reference is held on mnt.
func (mnt *Mount) tryIncMountedRef() bool {
	// Compare-and-swap loop: retry if refs changed between the load and the
	// swap.
	for {
		r := mnt.refs.Load()
		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
			return false
		}
		if mnt.refs.CompareAndSwap(r, r+1) {
			if mnt.LogRefs() {
				refs.LogTryIncRef(mnt, r+1)
			}
			return true
		}
	}
}

// IncRef increments mnt's reference count.
func (mnt *Mount) IncRef() {
	// In general, negative values for mnt.refs are valid because the MSB is
	// the eager-unmount bit.
	r := mnt.refs.Add(1)
	if mnt.LogRefs() {
		refs.LogIncRef(mnt, r)
	}
}

// DecRef decrements mnt's reference count.
936 func (mnt *Mount) DecRef(ctx context.Context) { 937 r := mnt.refs.Add(-1) 938 if mnt.LogRefs() { 939 refs.LogDecRef(mnt, r) 940 } 941 if r&^math.MinInt64 == 0 { // mask out MSB 942 refs.Unregister(mnt) 943 mnt.destroy(ctx) 944 } 945 } 946 947 func (mnt *Mount) destroy(ctx context.Context) { 948 mnt.vfs.lockMounts() 949 defer mnt.vfs.unlockMounts(ctx) 950 if mnt.parent() != nil { 951 mnt.vfs.mounts.seq.BeginWrite() 952 vd := mnt.vfs.disconnectLocked(mnt) 953 if vd.Ok() { 954 mnt.vfs.delayDecRef(vd) 955 } 956 mnt.vfs.mounts.seq.EndWrite() 957 } 958 959 // Cleanup any leftover children. The mount point has already been decref'd in 960 // umount so we just need to clean up the actual mounts. 961 if len(mnt.children) != 0 { 962 mnt.vfs.mounts.seq.BeginWrite() 963 for child := range mnt.children { 964 if checkInvariants { 965 if !child.umounted { 966 panic("children of a mount that has no references should already be marked as unmounted.") 967 } 968 } 969 mnt.vfs.disconnectLocked(child) 970 mnt.vfs.delayDecRef(child) 971 } 972 mnt.vfs.mounts.seq.EndWrite() 973 } 974 975 if mnt.root != nil { 976 mnt.vfs.delayDecRef(mnt.root) 977 } 978 mnt.vfs.delayDecRef(mnt.fs) 979 } 980 981 // RefType implements refs.CheckedObject.Type. 982 func (mnt *Mount) RefType() string { 983 return "vfs.Mount" 984 } 985 986 // LeakMessage implements refs.CheckedObject.LeakMessage. 987 func (mnt *Mount) LeakMessage() string { 988 return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, mnt.refs.Load()) 989 } 990 991 // LogRefs implements refs.CheckedObject.LogRefs. 992 // 993 // This should only be set to true for debugging purposes, as it can generate an 994 // extremely large amount of output and drastically degrade performance. 995 func (mnt *Mount) LogRefs() bool { 996 return false 997 } 998 999 // getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes 1000 // a reference on the returned Mount. 
// If (mnt, d) is not a mount point,
// getMountAt returns nil.
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	// - The caller is assumed to have checked d.isMounted() already. (This
	// isn't a precondition because it doesn't matter for correctness.)
	//
	// - We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	// - We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	//
	// From here on mnt is a Mount this function took a reference on, so each
	// step further up the stack drops the reference on the Mount being left.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount. Re-evaluate d.isMounted() and look up again.
			continue
		}
		mnt.DecRef(ctx)
		mnt = next
		d = next.root
	}
	return mnt
}

// getMountpoint returns the top mount for the given path.
// If the path is not a mountpoint, it returns an error.
//
// The returned VirtualDentry has an extra reference.
func (vfs *VirtualFilesystem) getMountpoint(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, error) {
	vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{})
	if err != nil {
		return VirtualDentry{}, err
	}
	// Linux passes the LOOKUP_MOUNTPOINT flag to user_path_at in ksys_umount to
	// resolve to the topmost mount in the stack located at the specified path.
	// vfs.getMountAt() imitates this behavior. See fs/namei.c:user_path_at(...)
	// and fs/namespace.c:ksys_umount(...).
	if vd.dentry.isMounted() {
		if mnt := vfs.getMountAt(ctx, vd.mount, vd.dentry); mnt != nil {
			// Swap the reference on the resolved mount for the one on the
			// topmost mount stacked at this location.
			//
			// NOTE(review): vd.dentry is left pointing at the original
			// dentry rather than mnt.root; presumably callers only rely on
			// vd.mount -- confirm.
			vd.mount.DecRef(ctx)
			vd.mount = mnt
		}
	} else if vd.dentry != vd.mount.root {
		// Nothing is mounted on the dentry and it is not the root of its own
		// mount, so the path does not name a mount point.
		vd.DecRef(ctx)
		return VirtualDentry{}, linuxerr.EINVAL
	}
	return vd, nil
}

// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns a
// zero-value VirtualDentry.
//
// Preconditions:
//   - References are held on mnt and vfsroot.
//   - vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	// - The caller must have already checked mnt against vfsroot.
	//
	// - We return a zero-value VirtualDentry, instead of mnt, if there is no
	// mount point for mnt.
	//
	// - We don't drop the caller's reference on mnt.
retryFirst:
	// Read mnt.key (parent and point) inside a vfs.mounts.seq read-side
	// critical section; retry if a writer intervened.
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	// Re-validate after taking both references: if the key changed in the
	// meantime, the references may be on a stale parent/point pair.
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	mnt = parent
	d := point
	// Keep walking up while (mnt, d) is itself the root of a mount, stopping
	// at vfsroot or at a mount that has no parent.
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		// Unlike the first step, the references on (mnt, d) were taken by
		// this function, so they are dropped before moving up.
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}

// PivotRoot makes location pointed to by newRootPop the root of the current
// namespace, and moves the current root to the location pointed to by
// putOldPop. If the operation is successful, it returns virtual dentries for
// the new root and the old root with an extra reference taken.
func (vfs *VirtualFilesystem) PivotRoot(ctx context.Context, creds *auth.Credentials, newRootPop *PathOperation, putOldPop *PathOperation) (newRoot, oldRoot VirtualDentry, err error) {
	newRoot, err = vfs.GetDentryAt(ctx, creds, newRootPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return
	}
	// The references taken during lookup are always dropped on return; on
	// success extra references are taken on newRoot and oldRoot just before
	// returning, so only the error paths hand back values without one.
	defer newRoot.DecRef(ctx)

	oldRoot = RootFromContext(ctx)
	defer oldRoot.DecRef(ctx)

	putOldVd, err := vfs.GetDentryAt(ctx, creds, putOldPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return
	}
	vfs.lockMounts()
	defer vfs.unlockMounts(ctx)
	// NOTE(review): lockMountpoint presumably returns with putOld.dentry.mu
	// held (the cleanup below unlocks it) -- confirm against its definition.
	putOld, err := vfs.lockMountpoint(putOldVd)
	if err != nil {
		return
	}
	vfs.delayDecRef(putOld)

	// Unlock putOld.dentry.mu on every validation failure below.
	cleanup := cleanup.Make(func() { putOld.dentry.mu.Unlock() })
	defer cleanup.Clean()
	// Neither new_root nor put_old can be on the same mount as the current
	// root mount.
	if newRoot.mount == oldRoot.mount || putOld.mount == oldRoot.mount {
		return newRoot, oldRoot, linuxerr.EBUSY
	}
	// new_root must be a mountpoint.
	if newRoot.mount.root != newRoot.dentry {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// new_root must not be locked.
	if newRoot.mount.locked {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// put_old must be at or underneath new_root.
	if !vfs.isPathReachable(ctx, newRoot, putOld) {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// the new root must be at or underneath the current root.
	if !vfs.isPathReachable(ctx, oldRoot, newRoot) {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// The current root directory must be a mountpoint
	// (in the case it has been chrooted).
	if oldRoot.mount.root != oldRoot.dentry {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// The current root and the new root must be in the context's mount namespace.
	if !vfs.validInMountNS(ctx, oldRoot.mount) || !vfs.validInMountNS(ctx, newRoot.mount) {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// The current root and the new root cannot be on the rootfs mount.
	if oldRoot.mount.parent() == nil || newRoot.mount.parent() == nil {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// Fail if either the mount point at new_root, or the parent mount of that
	// mount point, has propagation type MS_SHARED.
	if newRootParent := newRoot.mount.parent(); newRoot.mount.isShared || newRootParent.isShared {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// Fail if put_old is a mount point and has the propagation type MS_SHARED.
	if putOld.mount.root == putOld.dentry && putOld.mount.isShared {
		return newRoot, oldRoot, linuxerr.EINVAL
	}
	// All checks passed. putOld.dentry.mu is unlocked explicitly below after
	// the old root has been connected at putOld; Release presumably disarms
	// the deferred unlock so it does not run twice.
	cleanup.Release()

	vfs.mounts.seq.BeginWrite()
	// Detach both the new root and the old root from their current mount
	// points. The reference on the new root's former mount point is dropped;
	// the old root's former mount point (rootMp) is reused below.
	mp := vfs.disconnectLocked(newRoot.mount)
	vfs.delayDecRef(mp)
	rootMp := vfs.disconnectLocked(oldRoot.mount)
	// Transfer the locked flag from the old root mount to the new one.
	if oldRoot.mount.locked {
		newRoot.mount.locked = true
		oldRoot.mount.locked = false
	}

	// Attach the old root at put_old.
	putOld.IncRef()
	vfs.connectLocked(oldRoot.mount, putOld, putOld.mount.ns)
	putOld.dentry.mu.Unlock()

	// Attach the new root where the old root used to be mounted.
	rootMp.dentry.mu.Lock()
	vfs.connectLocked(newRoot.mount, rootMp, rootMp.mount.ns)
	rootMp.dentry.mu.Unlock()
	vfs.mounts.seq.EndWrite()

	vfs.delayDecRef(newRoot.mount)
	vfs.delayDecRef(oldRoot.mount)

	// Take the extra references handed to the caller; the deferred DecRefs
	// above drop the ones acquired during lookup.
	newRoot.IncRef()
	oldRoot.IncRef()
	return
}

// SetMountReadOnly sets the mount as ReadOnly.
//
// It returns whatever Mount.setReadOnlyLocked returns, after taking the mount
// lock around the call.
func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
	vfs.lockMounts()
	defer vfs.unlockMounts(context.Background())
	return mnt.setReadOnlyLocked(ro)
}

// CheckBeginWrite increments the counter of in-progress write operations on
// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
// EROFS.
1258 // 1259 // If CheckBeginWrite succeeds, EndWrite must be called when the write 1260 // operation is finished. 1261 func (mnt *Mount) CheckBeginWrite() error { 1262 if mnt.writers.Add(1) < 0 { 1263 mnt.writers.Add(-1) 1264 return linuxerr.EROFS 1265 } 1266 return nil 1267 } 1268 1269 // EndWrite indicates that a write operation signaled by a previous successful 1270 // call to CheckBeginWrite has finished. 1271 func (mnt *Mount) EndWrite() { 1272 mnt.writers.Add(-1) 1273 } 1274 1275 // Preconditions: VirtualFilesystem.mountMu must be locked. 1276 func (mnt *Mount) setReadOnlyLocked(ro bool) error { 1277 if oldRO := mnt.writers.Load() < 0; oldRO == ro { 1278 return nil 1279 } 1280 if ro { 1281 if !mnt.writers.CompareAndSwap(0, math.MinInt64) { 1282 return linuxerr.EBUSY 1283 } 1284 return nil 1285 } 1286 // Unset MSB without dropping any temporary increments from failed calls to 1287 // mnt.CheckBeginWrite(). 1288 mnt.writers.Add(math.MinInt64) 1289 return nil 1290 } 1291 1292 // ReadOnly returns true if mount is readonly. 1293 func (mnt *Mount) ReadOnly() bool { 1294 mnt.vfs.lockMounts() 1295 defer mnt.vfs.unlockMounts(context.Background()) 1296 return mnt.writers.Load() < 0 1297 } 1298 1299 // ReadOnlyLocked returns true if mount is readonly. 1300 // 1301 // Preconditions: VirtualFilesystem.mountMu must be locked. 1302 func (mnt *Mount) ReadOnlyLocked() bool { 1303 return mnt.writers.Load() < 0 1304 } 1305 1306 // Filesystem returns the mounted Filesystem. It does not take a reference on 1307 // the returned Filesystem. 1308 func (mnt *Mount) Filesystem() *Filesystem { 1309 return mnt.fs 1310 } 1311 1312 // submountsLocked returns this Mount and all Mounts that are descendents of 1313 // it. 1314 // 1315 // Precondition: mnt.vfs.mountMu must be held. 1316 func (mnt *Mount) submountsLocked() []*Mount { 1317 mounts := []*Mount{mnt} 1318 for m := range mnt.children { 1319 mounts = append(mounts, m.submountsLocked()...) 
1320 } 1321 return mounts 1322 } 1323 1324 // countSubmountsLocked returns mnt's total number of descendants including 1325 // uncommitted descendants. 1326 // 1327 // Precondition: mnt.vfs.mountMu must be held. 1328 func (mnt *Mount) countSubmountsLocked() uint32 { 1329 mounts := uint32(1) 1330 for m := range mnt.children { 1331 mounts += m.countSubmountsLocked() 1332 } 1333 return mounts 1334 } 1335 1336 // Root returns the mount's root. It does not take a reference on the returned 1337 // Dentry. 1338 func (mnt *Mount) Root() *Dentry { 1339 return mnt.root 1340 } 1341 1342 // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. 1343 // 1344 // Preconditions: taskRootDir.Ok(). 1345 func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { 1346 rootMnt := taskRootDir.mount 1347 1348 vfs.lockMounts() 1349 mounts := rootMnt.submountsLocked() 1350 // Take a reference on mounts since we need to drop vfs.mountMu before 1351 // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). 1352 for _, mnt := range mounts { 1353 mnt.IncRef() 1354 } 1355 vfs.unlockMounts(ctx) 1356 defer func() { 1357 for _, mnt := range mounts { 1358 mnt.DecRef(ctx) 1359 } 1360 }() 1361 sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) 1362 1363 for _, mnt := range mounts { 1364 // Get the path to this mount relative to task root. 1365 mntRootVD := VirtualDentry{ 1366 mount: mnt, 1367 dentry: mnt.root, 1368 } 1369 path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) 1370 if err != nil { 1371 // For some reason we didn't get a path. Log a warning 1372 // and run with empty path. 1373 ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) 1374 path = "" 1375 } 1376 if path == "" { 1377 // Either an error occurred, or path is not reachable 1378 // from root. 
1379 break 1380 } 1381 1382 mntOpts := mnt.Options() 1383 opts := "rw" 1384 if mntOpts.ReadOnly { 1385 opts = "ro" 1386 } 1387 if mntOpts.Flags.NoATime { 1388 opts = ",noatime" 1389 } 1390 if mntOpts.Flags.NoExec { 1391 opts += ",noexec" 1392 } 1393 if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { 1394 opts += "," + mopts 1395 } 1396 1397 // Format: 1398 // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order> 1399 // 1400 // The "needs dump" and "fsck order" flags are always 0, which 1401 // is allowed. 1402 fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) 1403 } 1404 } 1405 1406 // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to 1407 // buf. 1408 // 1409 // Preconditions: taskRootDir.Ok(). 1410 func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { 1411 rootMnt := taskRootDir.mount 1412 1413 vfs.lockMounts() 1414 mounts := rootMnt.submountsLocked() 1415 // Take a reference on mounts since we need to drop vfs.mountMu before 1416 // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or 1417 // vfs.StatAt() (=> FilesystemImpl.StatAt()). 1418 for _, mnt := range mounts { 1419 mnt.IncRef() 1420 } 1421 vfs.unlockMounts(ctx) 1422 defer func() { 1423 for _, mnt := range mounts { 1424 mnt.DecRef(ctx) 1425 } 1426 }() 1427 sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) 1428 1429 creds := auth.CredentialsFromContext(ctx) 1430 for _, mnt := range mounts { 1431 // Get the path to this mount relative to task root. 1432 mntRootVD := VirtualDentry{ 1433 mount: mnt, 1434 dentry: mnt.root, 1435 } 1436 pathFromRoot, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) 1437 if err != nil { 1438 // For some reason we didn't get a path. Log a warning 1439 // and run with empty path. 
1440 ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) 1441 continue 1442 } 1443 if pathFromRoot == "" { 1444 // The path is not reachable from root. 1445 continue 1446 } 1447 var pathFromFS string 1448 pathFromFS, err = vfs.PathnameInFilesystem(ctx, mntRootVD) 1449 if err != nil { 1450 // For some reason we didn't get a path. Log a warning 1451 // and run with empty path. 1452 ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) 1453 continue 1454 } 1455 if pathFromFS == "" { 1456 // The path is not reachable from root. 1457 continue 1458 } 1459 // Stat the mount root to get the major/minor device numbers. 1460 pop := &PathOperation{ 1461 Root: mntRootVD, 1462 Start: mntRootVD, 1463 } 1464 statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) 1465 if err != nil { 1466 // Well that's not good. Ignore this mount. 1467 ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) 1468 continue 1469 } 1470 1471 // Format: 1472 // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue 1473 // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) 1474 1475 // (1) Mount ID. 1476 fmt.Fprintf(buf, "%d ", mnt.ID) 1477 1478 // (2) Parent ID (or this ID if there is no parent). 1479 // Note that even if the call to mnt.parent() races with Mount 1480 // destruction (which is possible since we're not holding vfs.mountMu), 1481 // its Mount.ID will still be valid. 1482 pID := mnt.ID 1483 if p := mnt.parent(); p != nil { 1484 pID = p.ID 1485 } 1486 fmt.Fprintf(buf, "%d ", pID) 1487 1488 // (3) Major:Minor device ID. We don't have a superblock, so we 1489 // just use the root inode device number. 1490 fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) 1491 1492 // (4) Root: the pathname of the directory in the filesystem 1493 // which forms the root of this mount. 
1494 fmt.Fprintf(buf, "%s ", manglePath(pathFromFS)) 1495 1496 // (5) Mount point (relative to process root). 1497 fmt.Fprintf(buf, "%s ", manglePath(pathFromRoot)) 1498 1499 // (6) Mount options. 1500 opts := "rw" 1501 if mnt.ReadOnly() { 1502 opts = "ro" 1503 } 1504 if mnt.flags.NoATime { 1505 opts = ",noatime" 1506 } 1507 if mnt.flags.NoExec { 1508 opts += ",noexec" 1509 } 1510 fmt.Fprintf(buf, "%s ", opts) 1511 1512 // (7) Optional fields: zero or more fields of the form "tag[:value]". 1513 fmt.Fprintf(buf, "%s", vfs.generateOptionalTags(ctx, mnt, taskRootDir)) 1514 // (8) Separator: the end of the optional fields is marked by a single hyphen. 1515 fmt.Fprintf(buf, "- ") 1516 1517 // (9) Filesystem type. 1518 fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) 1519 1520 // (10) Mount source: filesystem-specific information or "none". 1521 fmt.Fprintf(buf, "none ") 1522 1523 // (11) Superblock options, and final newline. 1524 fmt.Fprintf(buf, "%s\n", superBlockOpts(pathFromRoot, mnt)) 1525 } 1526 } 1527 1528 // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. 1529 // See Linux fs/seq_file.c:mangle_path. 1530 func manglePath(p string) string { 1531 r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") 1532 return r.Replace(p) 1533 } 1534 1535 // superBlockOpts returns the super block options string for the mount at 1536 // the given path. 1537 func superBlockOpts(mountPath string, mnt *Mount) string { 1538 // Compose super block options by combining global mount flags with 1539 // FS-specific mount options. 1540 opts := "rw" 1541 if mnt.ReadOnly() { 1542 opts = "ro" 1543 } 1544 1545 if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { 1546 opts += "," + mopts 1547 } 1548 1549 // NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also 1550 // need to include the cgroup name in the options. For now we just read that 1551 // from the path. 
Note that this is only possible when "cgroup" isn't 1552 // registered as a valid filesystem type. 1553 // 1554 // TODO(gvisor.dev/issue/190): Once we removed fake cgroupfs support, we 1555 // should remove this. 1556 if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount { 1557 // Real cgroupfs available. 1558 return opts 1559 } 1560 if mnt.fs.FilesystemType().Name() == "cgroup" { 1561 splitPath := strings.Split(mountPath, "/") 1562 cgroupType := splitPath[len(splitPath)-1] 1563 opts += "," + cgroupType 1564 } 1565 1566 return opts 1567 } 1568 1569 func (vfs *VirtualFilesystem) generateOptionalTags(ctx context.Context, mnt *Mount, root VirtualDentry) string { 1570 vfs.lockMounts() 1571 defer vfs.unlockMounts(ctx) 1572 // TODO(b/249777195): Support MS_UNBINDABLE propagation type. 1573 var optionalSb strings.Builder 1574 if mnt.isShared { 1575 optionalSb.WriteString(fmt.Sprintf("shared:%d ", mnt.groupID)) 1576 } 1577 if mnt.isFollower() { 1578 // Per man mount_namespaces(7), propagate_from should not be 1579 // included in optional tags if the leader "is the immediate leader of the 1580 // mount, or if there is no dominant peer group under the same root". A 1581 // dominant peer group is the nearest reachable mount in the leader/follower 1582 // chain. 1583 optionalSb.WriteString(fmt.Sprintf("master:%d ", mnt.leader.groupID)) 1584 var dominant *Mount 1585 for m := mnt.leader; m != nil; m = m.leader { 1586 if dominant = vfs.peerUnderRoot(ctx, m, mnt.ns, root); dominant != nil { 1587 break 1588 } 1589 } 1590 if dominant != nil && dominant != mnt.leader { 1591 optionalSb.WriteString(fmt.Sprintf("propagate_from:%d ", dominant.groupID)) 1592 } 1593 } 1594 return optionalSb.String() 1595 }