// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vfs

import (
	"bytes"
	"fmt"
	"math"
	"sort"
	"strings"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/refsvfs2"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem
// (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem
// (Mount.fs), which applies to path resolution in the context of a particular
// Mount (Mount.key.parent).
//
// Mounts are reference-counted. Unless otherwise specified, all Mount methods
// require that a reference is held.
//
// Mount and Filesystem are distinct types because it's possible for a single
// Filesystem to be mounted at multiple locations and/or in multiple mount
// namespaces.
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID.
	ID uint64

	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". Immutable.
	Flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	refs int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu.
	children map[*Mount]struct{}

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers int64
}

// newMount returns a new Mount of fs rooted at root, associated with mntns
// (which may be nil). The Mount gets the next ID from vfs.lastMountID, starts
// with a reference count of 1, and is registered with refsvfs2.
//
// newMount does not itself take references on fs or root. Of the callers in
// this file, NewDisconnectedMount takes them before calling, and
// NewMountNamespace passes the values returned by GetFilesystem.
func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
	mnt := &Mount{
		ID:    atomic.AddUint64(&vfs.lastMountID, 1),
		Flags: opts.Flags,
		vfs:   vfs,
		fs:    fs,
		root:  root,
		ns:    mntns,
		refs:  1,
	}
	if opts.ReadOnly {
		// The result is discarded: mnt.writers is still zero on a Mount that
		// was just allocated, so the compare-and-swap from 0 performed by
		// setReadOnlyLocked cannot fail here.
		mnt.setReadOnlyLocked(true)
	}
	refsvfs2.Register(mnt)
	return mnt
}

// Options returns a copy of the MountOptions currently applicable to mnt.
// Only Flags and ReadOnly are populated; the snapshot is taken with
// vfs.mountMu held.
func (mnt *Mount) Options() MountOptions {
	mnt.vfs.mountMu.Lock()
	defer mnt.vfs.mountMu.Unlock()
	return MountOptions{
		Flags:    mnt.Flags,
		ReadOnly: mnt.ReadOnly(),
	}
}

// A MountNamespace is a collection of Mounts.
//
// MountNamespaces are reference-counted. Unless otherwise specified, all
// MountNamespace methods require that a reference is held.
//
// MountNamespace is analogous to Linux's struct mnt_namespace.
//
// +stateify savable
type MountNamespace struct {
	MountNamespaceRefs

	// Owner is the usernamespace that owns this mount namespace.
	Owner *auth.UserNamespace

	// root is the MountNamespace's root mount. root is immutable.
	root *Mount

	// mountpoints maps all Dentries which are mount points in this namespace
	// to the number of Mounts for which they are mount points. mountpoints is
	// protected by VirtualFilesystem.mountMu.
	//
	// mountpoints is used to determine if a Dentry can be moved or removed
	// (which requires that the Dentry is not a mount point in the calling
	// namespace).
	//
	// mountpoints is maintained even if there are no references held on the
	// MountNamespace; this is required to ensure that
	// VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate
	// correctly on unreferenced MountNamespaces.
	mountpoints map[*Dentry]uint32
}

// NewMountNamespace returns a new mount namespace with a root filesystem
// configured by the given arguments. A reference is taken on the returned
// MountNamespace.
//
// It returns ENODEV (after logging a warning) if fsTypeName is not a
// registered filesystem type, and otherwise propagates any error returned by
// the filesystem type's GetFilesystem.
func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) {
	rft := vfs.getFilesystemType(fsTypeName)
	if rft == nil {
		ctx.Warningf("Unknown filesystem type: %s", fsTypeName)
		return nil, linuxerr.ENODEV
	}
	fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
	if err != nil {
		return nil, err
	}
	mntns := &MountNamespace{
		Owner:       creds.UserNamespace,
		mountpoints: make(map[*Dentry]uint32),
	}
	mntns.InitRefs()
	// fs and root are handed to the root Mount as returned by GetFilesystem;
	// no additional references are taken on them here.
	mntns.root = newMount(vfs, fs, root, mntns, opts)
	return mntns, nil
}

// NewDisconnectedMount returns a Mount representing fs with the given root
// (which may be nil). The new Mount is not associated with any MountNamespace
// and is not connected to any other Mounts. References are taken on fs and
// root.
//
// The returned error is always nil in this implementation.
func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) (*Mount, error) {
	fs.IncRef()
	if root != nil {
		root.IncRef()
	}
	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
}

// MountDisconnected creates a Filesystem configured by the given arguments,
// then returns a Mount representing it. The new Mount is not associated with
// any MountNamespace and is not connected to any other Mounts.
193 func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) { 194 rft := vfs.getFilesystemType(fsTypeName) 195 if rft == nil { 196 return nil, linuxerr.ENODEV 197 } 198 if !opts.InternalMount && !rft.opts.AllowUserMount { 199 return nil, linuxerr.ENODEV 200 } 201 fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) 202 if err != nil { 203 return nil, err 204 } 205 defer root.DecRef(ctx) 206 defer fs.DecRef(ctx) 207 return vfs.NewDisconnectedMount(fs, root, opts) 208 } 209 210 // ConnectMountAt connects mnt at the path represented by target. 211 // 212 // Preconditions: mnt must be disconnected. 213 func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error { 214 // We can't hold vfs.mountMu while calling FilesystemImpl methods due to 215 // lock ordering. 216 vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) 217 if err != nil { 218 return err 219 } 220 vfs.mountMu.Lock() 221 vdDentry := vd.dentry 222 vdDentry.mu.Lock() 223 for { 224 if vd.mount.umounted || vdDentry.dead { 225 vdDentry.mu.Unlock() 226 vfs.mountMu.Unlock() 227 vd.DecRef(ctx) 228 return syserror.ENOENT 229 } 230 // vd might have been mounted over between vfs.GetDentryAt() and 231 // vfs.mountMu.Lock(). 232 if !vdDentry.isMounted() { 233 break 234 } 235 nextmnt := vfs.mounts.Lookup(vd.mount, vdDentry) 236 if nextmnt == nil { 237 break 238 } 239 // It's possible that nextmnt has been umounted but not disconnected, 240 // in which case vfs no longer holds a reference on it, and the last 241 // reference may be concurrently dropped even though we're holding 242 // vfs.mountMu. 243 if !nextmnt.tryIncMountedRef() { 244 break 245 } 246 // This can't fail since we're holding vfs.mountMu. 
247 nextmnt.root.IncRef() 248 vdDentry.mu.Unlock() 249 vd.DecRef(ctx) 250 vd = VirtualDentry{ 251 mount: nextmnt, 252 dentry: nextmnt.root, 253 } 254 vdDentry.mu.Lock() 255 } 256 // TODO(github.com/SagerNet/issue/1035): Linux requires that either both the mount 257 // point and the mount root are directories, or neither are, and returns 258 // ENOTDIR if this is not the case. 259 mntns := vd.mount.ns 260 vfs.mounts.seq.BeginWrite() 261 vfs.connectLocked(mnt, vd, mntns) 262 vfs.mounts.seq.EndWrite() 263 vdDentry.mu.Unlock() 264 vfs.mountMu.Unlock() 265 return nil 266 } 267 268 // MountAt creates and mounts a Filesystem configured by the given arguments. 269 // The VirtualFilesystem will hold a reference to the Mount until it is unmounted. 270 // 271 // This method returns the mounted Mount without a reference, for convenience 272 // during VFS setup when there is no chance of racing with unmount. 273 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { 274 mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) 275 if err != nil { 276 return nil, err 277 } 278 defer mnt.DecRef(ctx) 279 if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { 280 return nil, err 281 } 282 return mnt, nil 283 } 284 285 // UmountAt removes the Mount at the given path. 286 func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { 287 if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { 288 return linuxerr.EINVAL 289 } 290 291 // MNT_FORCE is currently unimplemented except for the permission check. 292 // Force unmounting specifically requires CAP_SYS_ADMIN in the root user 293 // namespace, and not in the owner user namespace for the target mount. See 294 // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) 
295 if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { 296 return linuxerr.EPERM 297 } 298 299 vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) 300 if err != nil { 301 return err 302 } 303 defer vd.DecRef(ctx) 304 if vd.dentry != vd.mount.root { 305 return linuxerr.EINVAL 306 } 307 vfs.mountMu.Lock() 308 if mntns := MountNamespaceFromContext(ctx); mntns != nil { 309 defer mntns.DecRef(ctx) 310 if mntns != vd.mount.ns { 311 vfs.mountMu.Unlock() 312 return linuxerr.EINVAL 313 } 314 315 if vd.mount == vd.mount.ns.root { 316 vfs.mountMu.Unlock() 317 return linuxerr.EINVAL 318 } 319 } 320 321 // TODO(github.com/SagerNet/issue/1035): Linux special-cases umount of the caller's 322 // root, which we don't implement yet (we'll just fail it since the caller 323 // holds a reference on it). 324 325 vfs.mounts.seq.BeginWrite() 326 if opts.Flags&linux.MNT_DETACH == 0 { 327 if len(vd.mount.children) != 0 { 328 vfs.mounts.seq.EndWrite() 329 vfs.mountMu.Unlock() 330 return linuxerr.EBUSY 331 } 332 // We are holding a reference on vd.mount. 333 expectedRefs := int64(1) 334 if !vd.mount.umounted { 335 expectedRefs = 2 336 } 337 if atomic.LoadInt64(&vd.mount.refs)&^math.MinInt64 != expectedRefs { // mask out MSB 338 vfs.mounts.seq.EndWrite() 339 vfs.mountMu.Unlock() 340 return linuxerr.EBUSY 341 } 342 } 343 vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(vd.mount, &umountRecursiveOptions{ 344 eager: opts.Flags&linux.MNT_DETACH == 0, 345 disconnectHierarchy: true, 346 }, nil, nil) 347 vfs.mounts.seq.EndWrite() 348 vfs.mountMu.Unlock() 349 for _, vd := range vdsToDecRef { 350 vd.DecRef(ctx) 351 } 352 for _, mnt := range mountsToDecRef { 353 mnt.DecRef(ctx) 354 } 355 return nil 356 } 357 358 // +stateify savable 359 type umountRecursiveOptions struct { 360 // If eager is true, ensure that future calls to Mount.tryIncMountedRef() 361 // on umounted mounts fail. 
362 // 363 // eager is analogous to Linux's UMOUNT_SYNC. 364 eager bool 365 366 // If disconnectHierarchy is true, Mounts that are umounted hierarchically 367 // should be disconnected from their parents. (Mounts whose parents are not 368 // umounted, which in most cases means the Mount passed to the initial call 369 // to umountRecursiveLocked, are unconditionally disconnected for 370 // consistency with Linux.) 371 // 372 // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. 373 disconnectHierarchy bool 374 } 375 376 // umountRecursiveLocked marks mnt and its descendants as umounted. It does not 377 // release mount or dentry references; instead, it appends VirtualDentries and 378 // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef 379 // respectively, and returns updated slices. (This is necessary because 380 // filesystem locks possibly taken by DentryImpl.DecRef() may precede 381 // vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.) 382 // 383 // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). 384 // 385 // Preconditions: 386 // * vfs.mountMu must be locked. 387 // * vfs.mounts.seq must be in a writer critical section. 
388 func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { 389 if !mnt.umounted { 390 mnt.umounted = true 391 mountsToDecRef = append(mountsToDecRef, mnt) 392 if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) { 393 vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt)) 394 } 395 } 396 if opts.eager { 397 for { 398 refs := atomic.LoadInt64(&mnt.refs) 399 if refs < 0 { 400 break 401 } 402 if atomic.CompareAndSwapInt64(&mnt.refs, refs, refs|math.MinInt64) { 403 break 404 } 405 } 406 } 407 for child := range mnt.children { 408 vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef) 409 } 410 return vdsToDecRef, mountsToDecRef 411 } 412 413 // connectLocked makes vd the mount parent/point for mnt. It consumes 414 // references held by vd. 415 // 416 // Preconditions: 417 // * vfs.mountMu must be locked. 418 // * vfs.mounts.seq must be in a writer critical section. 419 // * d.mu must be locked. 420 // * mnt.parent() == nil, i.e. mnt must not already be connected. 
421 func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) { 422 if checkInvariants { 423 if mnt.parent() != nil { 424 panic("VFS.connectLocked called on connected mount") 425 } 426 } 427 mnt.IncRef() // dropped by callers of umountRecursiveLocked 428 mnt.setKey(vd) 429 if vd.mount.children == nil { 430 vd.mount.children = make(map[*Mount]struct{}) 431 } 432 vd.mount.children[mnt] = struct{}{} 433 atomic.AddUint32(&vd.dentry.mounts, 1) 434 mnt.ns = mntns 435 mntns.mountpoints[vd.dentry]++ 436 vfs.mounts.insertSeqed(mnt) 437 vfsmpmounts, ok := vfs.mountpoints[vd.dentry] 438 if !ok { 439 vfsmpmounts = make(map[*Mount]struct{}) 440 vfs.mountpoints[vd.dentry] = vfsmpmounts 441 } 442 vfsmpmounts[mnt] = struct{}{} 443 } 444 445 // disconnectLocked makes vd have no mount parent/point and returns its old 446 // mount parent/point with a reference held. 447 // 448 // Preconditions: 449 // * vfs.mountMu must be locked. 450 // * vfs.mounts.seq must be in a writer critical section. 451 // * mnt.parent() != nil. 452 func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry { 453 vd := mnt.getKey() 454 if checkInvariants { 455 if vd.mount != nil { 456 panic("VFS.disconnectLocked called on disconnected mount") 457 } 458 } 459 mnt.loadKey(VirtualDentry{}) 460 delete(vd.mount.children, mnt) 461 atomic.AddUint32(&vd.dentry.mounts, math.MaxUint32) // -1 462 mnt.ns.mountpoints[vd.dentry]-- 463 if mnt.ns.mountpoints[vd.dentry] == 0 { 464 delete(mnt.ns.mountpoints, vd.dentry) 465 } 466 vfs.mounts.removeSeqed(mnt) 467 vfsmpmounts := vfs.mountpoints[vd.dentry] 468 delete(vfsmpmounts, mnt) 469 if len(vfsmpmounts) == 0 { 470 delete(vfs.mountpoints, vd.dentry) 471 } 472 return vd 473 } 474 475 // tryIncMountedRef increments mnt's reference count and returns true. If mnt's 476 // reference count is already zero, or has been eagerly umounted, 477 // tryIncMountedRef does nothing and returns false. 
478 // 479 // tryIncMountedRef does not require that a reference is held on mnt. 480 func (mnt *Mount) tryIncMountedRef() bool { 481 for { 482 r := atomic.LoadInt64(&mnt.refs) 483 if r <= 0 { // r < 0 => MSB set => eagerly unmounted 484 return false 485 } 486 if atomic.CompareAndSwapInt64(&mnt.refs, r, r+1) { 487 if mnt.LogRefs() { 488 refsvfs2.LogTryIncRef(mnt, r+1) 489 } 490 return true 491 } 492 } 493 } 494 495 // IncRef increments mnt's reference count. 496 func (mnt *Mount) IncRef() { 497 // In general, negative values for mnt.refs are valid because the MSB is 498 // the eager-unmount bit. 499 r := atomic.AddInt64(&mnt.refs, 1) 500 if mnt.LogRefs() { 501 refsvfs2.LogIncRef(mnt, r) 502 } 503 } 504 505 // DecRef decrements mnt's reference count. 506 func (mnt *Mount) DecRef(ctx context.Context) { 507 r := atomic.AddInt64(&mnt.refs, -1) 508 if mnt.LogRefs() { 509 refsvfs2.LogDecRef(mnt, r) 510 } 511 if r&^math.MinInt64 == 0 { // mask out MSB 512 refsvfs2.Unregister(mnt) 513 mnt.destroy(ctx) 514 } 515 } 516 517 func (mnt *Mount) destroy(ctx context.Context) { 518 var vd VirtualDentry 519 if mnt.parent() != nil { 520 mnt.vfs.mountMu.Lock() 521 mnt.vfs.mounts.seq.BeginWrite() 522 vd = mnt.vfs.disconnectLocked(mnt) 523 mnt.vfs.mounts.seq.EndWrite() 524 mnt.vfs.mountMu.Unlock() 525 } 526 if mnt.root != nil { 527 mnt.root.DecRef(ctx) 528 } 529 mnt.fs.DecRef(ctx) 530 if vd.Ok() { 531 vd.DecRef(ctx) 532 } 533 } 534 535 // RefType implements refsvfs2.CheckedObject.Type. 536 func (mnt *Mount) RefType() string { 537 return "vfs.Mount" 538 } 539 540 // LeakMessage implements refsvfs2.CheckedObject.LeakMessage. 541 func (mnt *Mount) LeakMessage() string { 542 return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, atomic.LoadInt64(&mnt.refs)) 543 } 544 545 // LogRefs implements refsvfs2.CheckedObject.LogRefs. 
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
func (mnt *Mount) LogRefs() bool {
	return false
}

// DecRef decrements mntns' reference count.
//
// The function passed to MountNamespaceRefs.DecRef umounts the whole tree
// under mntns.root and then drops the collected references.
// NOTE(review): presumably that function only runs when the last reference is
// dropped; confirm against MountNamespaceRefs.
func (mntns *MountNamespace) DecRef(ctx context.Context) {
	vfs := mntns.root.fs.VirtualFilesystem()
	mntns.MountNamespaceRefs.DecRef(func() {
		vfs.mountMu.Lock()
		vfs.mounts.seq.BeginWrite()
		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
			disconnectHierarchy: true,
		}, nil, nil)
		vfs.mounts.seq.EndWrite()
		vfs.mountMu.Unlock()
		// References are dropped only after vfs.mountMu has been released.
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
		for _, mnt := range mountsToDecRef {
			mnt.DecRef(ctx)
		}
	})
}

// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
// a reference on the returned Mount. If (mnt, d) is not a mount point,
// getMountAt returns nil.
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	// - The caller is assumed to have checked d.isMounted() already. (This
	// isn't a precondition because it doesn't matter for correctness.)
	//
	// - We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	// - We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount.
			continue
		}
		// From here on mnt is a reference this function took itself, so it is
		// released when stepping up to the next mount in the stack.
		mnt.DecRef(ctx)
		mnt = next
		d = next.root
	}
	return mnt
}

// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns a
// zero-value VirtualDentry.
//
// Preconditions:
// * References are held on mnt and root (presumably vfsroot; confirm against
// callers).
// * vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	// - The caller must have already checked mnt against vfsroot.
	//
	// - We return a zero-value VirtualDentry, instead of mnt, if there is no
	// mount point for mnt.
	//
	// - We don't drop the caller's reference on mnt.
retryFirst:
	// Read mnt's key under the mounts seqcount; a failed ReadOk means a writer
	// ran concurrently and the values read may be inconsistent.
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	// Re-validate after taking the references: if the key changed in the
	// meantime, undo both and start over.
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	mnt = parent
	d := point
	// Keep walking up while (mnt, d) is itself the root of a mount, stopping
	// at vfsroot or at a mount with no parent.
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		// Unlike the first step, the references on d and mnt were taken by
		// this function, so they are released before moving up.
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}

// SetMountReadOnly sets the mount as ReadOnly (ro == true) or read-write
// (ro == false). It returns whatever Mount.setReadOnlyLocked returns, which
// is EBUSY when switching to read-only while writes are in progress.
func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
	vfs.mountMu.Lock()
	defer vfs.mountMu.Unlock()
	return mnt.setReadOnlyLocked(ro)
}

// CheckBeginWrite increments the counter of in-progress write operations on
// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
// EROFS.
//
// If CheckBeginWrite succeeds, EndWrite must be called when the write
// operation is finished.
func (mnt *Mount) CheckBeginWrite() error {
	// A negative result means the MSB (read-only bit) is set; undo the
	// speculative increment before failing.
	if atomic.AddInt64(&mnt.writers, 1) < 0 {
		atomic.AddInt64(&mnt.writers, -1)
		return linuxerr.EROFS
	}
	return nil
}

// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished.
721 func (mnt *Mount) EndWrite() { 722 atomic.AddInt64(&mnt.writers, -1) 723 } 724 725 // Preconditions: VirtualFilesystem.mountMu must be locked. 726 func (mnt *Mount) setReadOnlyLocked(ro bool) error { 727 if oldRO := atomic.LoadInt64(&mnt.writers) < 0; oldRO == ro { 728 return nil 729 } 730 if ro { 731 if !atomic.CompareAndSwapInt64(&mnt.writers, 0, math.MinInt64) { 732 return linuxerr.EBUSY 733 } 734 return nil 735 } 736 // Unset MSB without dropping any temporary increments from failed calls to 737 // mnt.CheckBeginWrite(). 738 atomic.AddInt64(&mnt.writers, math.MinInt64) 739 return nil 740 } 741 742 // ReadOnly returns true if mount is readonly. 743 func (mnt *Mount) ReadOnly() bool { 744 return atomic.LoadInt64(&mnt.writers) < 0 745 } 746 747 // Filesystem returns the mounted Filesystem. It does not take a reference on 748 // the returned Filesystem. 749 func (mnt *Mount) Filesystem() *Filesystem { 750 return mnt.fs 751 } 752 753 // submountsLocked returns this Mount and all Mounts that are descendents of 754 // it. 755 // 756 // Precondition: mnt.vfs.mountMu must be held. 757 func (mnt *Mount) submountsLocked() []*Mount { 758 mounts := []*Mount{mnt} 759 for m := range mnt.children { 760 mounts = append(mounts, m.submountsLocked()...) 761 } 762 return mounts 763 } 764 765 // Root returns the mount's root. It does not take a reference on the returned 766 // Dentry. 767 func (mnt *Mount) Root() *Dentry { 768 return mnt.root 769 } 770 771 // Root returns mntns' root. It does not take a reference on the returned Dentry. 772 func (mntns *MountNamespace) Root() VirtualDentry { 773 vd := VirtualDentry{ 774 mount: mntns.root, 775 dentry: mntns.root.root, 776 } 777 return vd 778 } 779 780 // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. 781 // 782 // Preconditions: taskRootDir.Ok(). 
783 func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { 784 rootMnt := taskRootDir.mount 785 786 vfs.mountMu.Lock() 787 mounts := rootMnt.submountsLocked() 788 // Take a reference on mounts since we need to drop vfs.mountMu before 789 // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). 790 for _, mnt := range mounts { 791 mnt.IncRef() 792 } 793 vfs.mountMu.Unlock() 794 defer func() { 795 for _, mnt := range mounts { 796 mnt.DecRef(ctx) 797 } 798 }() 799 sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) 800 801 for _, mnt := range mounts { 802 // Get the path to this mount relative to task root. 803 mntRootVD := VirtualDentry{ 804 mount: mnt, 805 dentry: mnt.root, 806 } 807 path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) 808 if err != nil { 809 // For some reason we didn't get a path. Log a warning 810 // and run with empty path. 811 ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) 812 path = "" 813 } 814 if path == "" { 815 // Either an error occurred, or path is not reachable 816 // from root. 817 break 818 } 819 820 opts := "rw" 821 if mnt.ReadOnly() { 822 opts = "ro" 823 } 824 if mnt.Flags.NoATime { 825 opts = ",noatime" 826 } 827 if mnt.Flags.NoExec { 828 opts += ",noexec" 829 } 830 if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { 831 opts += "," + mopts 832 } 833 834 // Format: 835 // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order> 836 // 837 // The "needs dump" and "fsck order" flags are always 0, which 838 // is allowed. 839 fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) 840 } 841 } 842 843 // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to 844 // buf. 845 // 846 // Preconditions: taskRootDir.Ok(). 
847 func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { 848 rootMnt := taskRootDir.mount 849 850 vfs.mountMu.Lock() 851 mounts := rootMnt.submountsLocked() 852 // Take a reference on mounts since we need to drop vfs.mountMu before 853 // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or 854 // vfs.StatAt() (=> FilesystemImpl.StatAt()). 855 for _, mnt := range mounts { 856 mnt.IncRef() 857 } 858 vfs.mountMu.Unlock() 859 defer func() { 860 for _, mnt := range mounts { 861 mnt.DecRef(ctx) 862 } 863 }() 864 sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) 865 866 creds := auth.CredentialsFromContext(ctx) 867 for _, mnt := range mounts { 868 // Get the path to this mount relative to task root. 869 mntRootVD := VirtualDentry{ 870 mount: mnt, 871 dentry: mnt.root, 872 } 873 path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) 874 if err != nil { 875 // For some reason we didn't get a path. Log a warning 876 // and run with empty path. 877 ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) 878 path = "" 879 } 880 if path == "" { 881 // Either an error occurred, or path is not reachable 882 // from root. 883 break 884 } 885 // Stat the mount root to get the major/minor device numbers. 886 pop := &PathOperation{ 887 Root: mntRootVD, 888 Start: mntRootVD, 889 } 890 statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) 891 if err != nil { 892 // Well that's not good. Ignore this mount. 893 ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) 894 break 895 } 896 897 // Format: 898 // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue 899 // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) 900 901 // (1) Mount ID. 902 fmt.Fprintf(buf, "%d ", mnt.ID) 903 904 // (2) Parent ID (or this ID if there is no parent). 
905 // Note that even if the call to mnt.parent() races with Mount 906 // destruction (which is possible since we're not holding vfs.mountMu), 907 // its Mount.ID will still be valid. 908 pID := mnt.ID 909 if p := mnt.parent(); p != nil { 910 pID = p.ID 911 } 912 fmt.Fprintf(buf, "%d ", pID) 913 914 // (3) Major:Minor device ID. We don't have a superblock, so we 915 // just use the root inode device number. 916 fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) 917 918 // (4) Root: the pathname of the directory in the filesystem 919 // which forms the root of this mount. 920 // 921 // NOTE(b/78135857): This will always be "/" until we implement 922 // bind mounts. 923 fmt.Fprintf(buf, "/ ") 924 925 // (5) Mount point (relative to process root). 926 fmt.Fprintf(buf, "%s ", manglePath(path)) 927 928 // (6) Mount options. 929 opts := "rw" 930 if mnt.ReadOnly() { 931 opts = "ro" 932 } 933 if mnt.Flags.NoATime { 934 opts = ",noatime" 935 } 936 if mnt.Flags.NoExec { 937 opts += ",noexec" 938 } 939 fmt.Fprintf(buf, "%s ", opts) 940 941 // (7) Optional fields: zero or more fields of the form "tag[:value]". 942 // (8) Separator: the end of the optional fields is marked by a single hyphen. 943 fmt.Fprintf(buf, "- ") 944 945 // (9) Filesystem type. 946 fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) 947 948 // (10) Mount source: filesystem-specific information or "none". 949 fmt.Fprintf(buf, "none ") 950 951 // (11) Superblock options, and final newline. 952 fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt)) 953 } 954 } 955 956 // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. 957 // See Linux fs/seq_file.c:mangle_path. 958 func manglePath(p string) string { 959 r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") 960 return r.Replace(p) 961 } 962 963 // superBlockOpts returns the super block options string for the the mount at 964 // the given path. 
965 func superBlockOpts(mountPath string, mnt *Mount) string { 966 // Compose super block options by combining global mount flags with 967 // FS-specific mount options. 968 opts := "rw" 969 if mnt.ReadOnly() { 970 opts = "ro" 971 } 972 973 if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { 974 opts += "," + mopts 975 } 976 977 // NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also 978 // need to include the cgroup name in the options. For now we just read that 979 // from the path. Note that this is only possible when "cgroup" isn't 980 // registered as a valid filesystem type. 981 // 982 // TODO(github.com/SagerNet/issue/190): Once we removed fake cgroupfs support, we 983 // should remove this. 984 if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount { 985 // Real cgroupfs available. 986 return opts 987 } 988 if mnt.fs.FilesystemType().Name() == "cgroup" { 989 splitPath := strings.Split(mountPath, "/") 990 cgroupType := splitPath[len(splitPath)-1] 991 opts += "," + cgroupType 992 } 993 994 return opts 995 }