github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/vfs/mount.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package vfs 16 17 import ( 18 "bytes" 19 "fmt" 20 "math" 21 "sort" 22 "strings" 23 24 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 25 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 26 "github.com/nicocha30/gvisor-ligolo/pkg/context" 27 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 28 "github.com/nicocha30/gvisor-ligolo/pkg/refs" 29 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 30 ) 31 32 // MountMax is the maximum number of mounts allowed. In Linux this can be 33 // configured by the user at /proc/sys/fs/mount-max, but the default is 34 // 100,000. We set the gVisor limit to 10,000. 35 const MountMax = 10000 36 37 // A Mount is a replacement of a Dentry (Mount.key.point) from one Filesystem 38 // (Mount.key.parent.fs) with a Dentry (Mount.root) from another Filesystem 39 // (Mount.fs), which applies to path resolution in the context of a particular 40 // Mount (Mount.key.parent). 41 // 42 // Mounts are reference-counted. Unless otherwise specified, all Mount methods 43 // require that a reference is held. 44 // 45 // Mount and Filesystem are distinct types because it's possible for a single 46 // Filesystem to be mounted at multiple locations and/or in multiple mount 47 // namespaces. 
//
// Mount is analogous to Linux's struct mount. (gVisor does not distinguish
// between struct mount and struct vfsmount.)
//
// +stateify savable
type Mount struct {
	// vfs, fs, root are immutable. References are held on fs and root.
	// Note that for a disconnected mount, root may be nil.
	//
	// Invariant: if not nil, root belongs to fs.
	vfs  *VirtualFilesystem
	fs   *Filesystem
	root *Dentry

	// ID is the immutable mount ID.
	ID uint64

	// Flags contains settings as specified for mount(2), e.g. MS_NOEXEC, except
	// for MS_RDONLY which is tracked in "writers". Immutable.
	Flags MountFlags

	// key is protected by VirtualFilesystem.mountMu and
	// VirtualFilesystem.mounts.seq, and may be nil. References are held on
	// key.parent and key.point if they are not nil.
	//
	// Invariant: key.parent != nil iff key.point != nil. key.point belongs to
	// key.parent.fs.
	key mountKey `state:".(VirtualDentry)"`

	// ns is the namespace in which this Mount was mounted. ns is protected by
	// VirtualFilesystem.mountMu.
	ns *MountNamespace

	// The lower 63 bits of refs are a reference count. The MSB of refs is set
	// if the Mount has been eagerly umounted, as by umount(2) without the
	// MNT_DETACH flag. refs is accessed using atomic memory operations.
	refs atomicbitops.Int64

	// children is the set of all Mounts for which Mount.key.parent is this
	// Mount. children is protected by VirtualFilesystem.mountMu.
	children map[*Mount]struct{}

	// propType is the propagation type of this mount. It can be shared or
	// private.
	propType PropagationType

	// sharedList is a list of mounts in the shared peer group. It is nil if
	// propType is not Shared. All mounts in a shared peer group hold the same
	// sharedList. The mounts in sharedList do not need an extra reference taken
	// because it would be redundant with the reference taken for being
	// attached to a parent mount. A mount is in a shared list if and only if
	// it is attached and has the shared propagation type.
	sharedList  *sharedList
	sharedEntry sharedEntry

	// groupID is the ID for this mount's shared peer group. If the mount is not
	// in a peer group, this is 0.
	groupID uint32

	// umounted is true if VFS.umountRecursiveLocked() has been called on this
	// Mount. VirtualFilesystem does not hold a reference on Mounts for which
	// umounted is true. umounted is protected by VirtualFilesystem.mountMu.
	umounted bool

	// The lower 63 bits of writers is the number of calls to
	// Mount.CheckBeginWrite() that have not yet been paired with a call to
	// Mount.EndWrite(). The MSB of writers is set if MS_RDONLY is in effect.
	// writers is accessed using atomic memory operations.
	writers atomicbitops.Int64
}

// sharedMapper maps a Mount to the sharedEntry embedded in it.
//
// NOTE(review): presumably this is the linker mapper consumed by the generated
// sharedList implementation; confirm against the list template.
type sharedMapper struct{}

// linkerFor returns a pointer to mnt's embedded sharedEntry.
func (sharedMapper) linkerFor(mnt *Mount) *sharedEntry { return &mnt.sharedEntry }

// newMount allocates a Mount for (fs, root) in namespace mntns with a fresh
// mount ID, the flags from opts, propagation type Private and a reference
// count of one, and registers it with the refs leak checker.
//
// newMount does not itself take references on fs or root; callers that need
// the Mount to own references must arrange for them before calling.
func newMount(vfs *VirtualFilesystem, fs *Filesystem, root *Dentry, mntns *MountNamespace, opts *MountOptions) *Mount {
	mnt := &Mount{
		ID:       vfs.lastMountID.Add(1),
		Flags:    opts.Flags,
		vfs:      vfs,
		fs:       fs,
		root:     root,
		ns:       mntns,
		propType: Private,
		refs:     atomicbitops.FromInt64(1),
	}
	if opts.ReadOnly {
		// The returned error is deliberately ignored: writers is still zero
		// on a freshly allocated Mount, so switching to read-only cannot
		// report EBUSY here.
		// NOTE(review): called without vfs.mountMu held; presumably safe
		// because mnt has not been published yet - confirm.
		mnt.setReadOnlyLocked(true)
	}
	refs.Register(mnt)
	return mnt
}

// Options returns a copy of the MountOptions currently applicable to mnt.
142 func (mnt *Mount) Options() MountOptions { 143 mnt.vfs.mountMu.Lock() 144 defer mnt.vfs.mountMu.Unlock() 145 return MountOptions{ 146 Flags: mnt.Flags, 147 ReadOnly: mnt.ReadOnly(), 148 } 149 } 150 151 func (mnt *Mount) generateOptionalTags() string { 152 mnt.vfs.mountMu.Lock() 153 defer mnt.vfs.mountMu.Unlock() 154 // TODO(b/249777195): Support MS_SLAVE and MS_UNBINDABLE propagation types. 155 var optional string 156 if mnt.propType == Shared { 157 optional = fmt.Sprintf("shared:%d", mnt.groupID) 158 } 159 return optional 160 } 161 162 // A MountNamespace is a collection of Mounts.// 163 // MountNamespaces are reference-counted. Unless otherwise specified, all 164 // MountNamespace methods require that a reference is held. 165 // 166 // MountNamespace is analogous to Linux's struct mnt_namespace. 167 // 168 // +stateify savable 169 type MountNamespace struct { 170 MountNamespaceRefs 171 172 // Owner is the usernamespace that owns this mount namespace. 173 Owner *auth.UserNamespace 174 175 // root is the MountNamespace's root mount. 176 root *Mount 177 178 // mountpoints maps all Dentries which are mount points in this namespace 179 // to the number of Mounts for which they are mount points. mountpoints is 180 // protected by VirtualFilesystem.mountMu. 181 // 182 // mountpoints is used to determine if a Dentry can be moved or removed 183 // (which requires that the Dentry is not a mount point in the calling 184 // namespace). 185 // 186 // mountpoints is maintained even if there are no references held on the 187 // MountNamespace; this is required to ensure that 188 // VFS.PrepareDeleteDentry() and VFS.PrepareRemoveDentry() operate 189 // correctly on unreferenced MountNamespaces. 190 mountpoints map[*Dentry]uint32 191 192 // mounts is the total number of mounts in this mount namespace. 193 mounts uint32 194 } 195 196 // NewMountNamespace returns a new mount namespace with a root filesystem 197 // configured by the given arguments. 
A reference is taken on the returned 198 // MountNamespace. 199 func (vfs *VirtualFilesystem) NewMountNamespace(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*MountNamespace, error) { 200 rft := vfs.getFilesystemType(fsTypeName) 201 if rft == nil { 202 ctx.Warningf("Unknown filesystem type: %s", fsTypeName) 203 return nil, linuxerr.ENODEV 204 } 205 fs, root, err := rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions) 206 if err != nil { 207 return nil, err 208 } 209 return vfs.NewMountNamespaceFrom(ctx, creds, fs, root, opts), nil 210 } 211 212 // NewMountNamespaceFrom constructs a new mount namespace from an existing 213 // filesystem and its root dentry. This is similar to NewMountNamespace, but 214 // uses an existing filesystem instead of constructing a new one. 215 func (vfs *VirtualFilesystem) NewMountNamespaceFrom(ctx context.Context, creds *auth.Credentials, fs *Filesystem, root *Dentry, opts *MountOptions) *MountNamespace { 216 mntns := &MountNamespace{ 217 Owner: creds.UserNamespace, 218 mountpoints: make(map[*Dentry]uint32), 219 } 220 mntns.InitRefs() 221 mntns.root = newMount(vfs, fs, root, mntns, opts) 222 return mntns 223 } 224 225 // NewFilesystem creates a new filesystem object not yet associated with any 226 // mounts. It can be installed into the filesystem tree with ConnectMountAt. 227 // Note that only the filesystem-specific mount options from opts are used by 228 // this function, mount flags are ignored. To set mount flags, pass them to a 229 // corresponding ConnectMountAt. 
func (vfs *VirtualFilesystem) NewFilesystem(ctx context.Context, creds *auth.Credentials, source, fsTypeName string, opts *MountOptions) (*Filesystem, *Dentry, error) {
	rft := vfs.getFilesystemType(fsTypeName)
	if rft == nil {
		return nil, nil, linuxerr.ENODEV
	}
	// Filesystem types that are not registered as user-mountable may only be
	// instantiated by internal mounts; both failures report ENODEV.
	if !opts.InternalMount && !rft.opts.AllowUserMount {
		return nil, nil, linuxerr.ENODEV
	}
	return rft.fsType.GetFilesystem(ctx, vfs, creds, source, opts.GetFilesystemOptions)
}

// NewDisconnectedMount returns a Mount representing fs with the given root
// (which may be nil). The new Mount is not associated with any MountNamespace
// and is not connected to any other Mounts. References are taken on fs and
// root.
func (vfs *VirtualFilesystem) NewDisconnectedMount(fs *Filesystem, root *Dentry, opts *MountOptions) *Mount {
	fs.IncRef()
	if root != nil {
		root.IncRef()
	}
	return newMount(vfs, fs, root, nil /* mntns */, opts)
}

// MountDisconnected creates a Filesystem configured by the given arguments,
// then returns a Mount representing it. The new Mount is not associated with
// any MountNamespace and is not connected to any other Mounts.
//
// Unlike NewDisconnectedMount, no extra references are taken here: the fs and
// root returned by NewFilesystem are handed directly to the new Mount.
func (vfs *VirtualFilesystem) MountDisconnected(ctx context.Context, creds *auth.Credentials, source string, fsTypeName string, opts *MountOptions) (*Mount, error) {
	fs, root, err := vfs.NewFilesystem(ctx, creds, source, fsTypeName, opts)
	if err != nil {
		return nil, err
	}
	return newMount(vfs, fs, root, nil /* mntns */, opts), nil
}

// ConnectMountAt connects mnt at the path represented by target.
//
// It returns ENOSPC if connecting mnt, together with the mounts created by
// propagation, would push the target namespace past MountMax, and otherwise
// any error from resolving target or from connectMountAtLocked.
//
// Preconditions: mnt must be disconnected.
func (vfs *VirtualFilesystem) ConnectMountAt(ctx context.Context, creds *auth.Credentials, mnt *Mount, target *PathOperation) error {
	// We can't hold vfs.mountMu while calling FilesystemImpl methods due to
	// lock ordering.
	vd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{})
	if err != nil {
		return err
	}
	vfs.mountMu.Lock()
	tree := vfs.preparePropagationTree(mnt, vd)
	// Check if the new mount + all the propagation mounts puts us over the max.
	if uint32(len(tree)+1)+vd.mount.ns.mounts > MountMax {
		// vd.DecRef must only run after mountMu has been unlocked, because
		// DecRef takes a lock on the filesystem mutex in some
		// implementations, which can lead to circular locking.
		vfs.abortPropagationTree(ctx, tree)
		vfs.mountMu.Unlock()
		vd.DecRef(ctx)
		return linuxerr.ENOSPC
	}
	// connectMountAtLocked consumes the reference on vd. The references it
	// hands back are dropped by the deferred function, i.e. after every
	// return path below has released mountMu.
	vdsToDecRef, err := vfs.connectMountAtLocked(ctx, mnt, vd)
	defer func() {
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
	}()
	if err != nil {
		vfs.abortPropagationTree(ctx, tree)
		vfs.mountMu.Unlock()
		return err
	}
	vfs.commitPropagationTree(ctx, tree)
	vfs.mountMu.Unlock()
	return nil
}

// connectMountAtLocked attaches mnt at vd. This method consumes a reference on
// vd and returns a list of VirtualDentry with an extra reference that must be
// DecRef'd outside of vfs.mountMu.
//
// It returns ENOENT if the mount or dentry being mounted on has been umounted
// or is dead.
//
// Preconditions:
//   - mnt must be disconnected.
//   - vfs.mountMu must be locked.
//
// +checklocks:vfs.mountMu
func (vfs *VirtualFilesystem) connectMountAtLocked(ctx context.Context, mnt *Mount, vd VirtualDentry) ([]VirtualDentry, error) {
	var vdsToDecRef []VirtualDentry
	vd.dentry.mu.Lock()
	// Walk to the top of any mount stack at vd so that mnt is attached on top
	// of whatever is already mounted there. vd.dentry.mu is held for the
	// current vd at the top of each iteration.
	for {
		if vd.mount.umounted || vd.dentry.dead {
			vd.dentry.mu.Unlock()
			vdsToDecRef = append(vdsToDecRef, vd)
			return vdsToDecRef, linuxerr.ENOENT
		}
		// vd might have been mounted over between vfs.GetDentryAt() and
		// vfs.mountMu.Lock().
		if !vd.dentry.isMounted() {
			break
		}
		nextmnt := vfs.mounts.Lookup(vd.mount, vd.dentry)
		if nextmnt == nil {
			break
		}
		// It's possible that nextmnt has been umounted but not disconnected,
		// in which case vfs no longer holds a reference on it, and the last
		// reference may be concurrently dropped even though we're holding
		// vfs.mountMu.
		if !nextmnt.tryIncMountedRef() {
			break
		}
		// This can't fail since we're holding vfs.mountMu.
		nextmnt.root.IncRef()
		vd.dentry.mu.Unlock()
		// The reference on the VirtualDentry we are stepping off is released
		// by the caller, outside vfs.mountMu.
		vdsToDecRef = append(vdsToDecRef, vd)
		vd = VirtualDentry{
			mount:  nextmnt,
			dentry: nextmnt.root,
		}
		vd.dentry.mu.Lock()
	}
	// TODO(gvisor.dev/issue/1035): Linux requires that either both the mount
	// point and the mount root are directories, or neither are, and returns
	// ENOTDIR if this is not the case.
	mntns := vd.mount.ns
	vfs.mounts.seq.BeginWrite()
	vfs.connectLocked(mnt, vd, mntns)
	vfs.mounts.seq.EndWrite()
	vd.dentry.mu.Unlock()
	return vdsToDecRef, nil
}

// CloneMountAt returns a new mount with the same fs, specified root and
// mount options. If mnt's propagation type is shared the new mount is
// automatically made a peer of mnt. If mount options are nil, mnt's
// options are copied.
//
// NOTE(review): addPeer is called unconditionally here; presumably it is a
// no-op when mnt is not shared - confirm against addPeer.
func (vfs *VirtualFilesystem) CloneMountAt(mnt *Mount, root *Dentry, mopts *MountOptions) *Mount {
	vfs.mountMu.Lock()
	defer vfs.mountMu.Unlock()
	clone := vfs.cloneMount(mnt, root, mopts)
	vfs.addPeer(mnt, clone)
	return clone
}

// cloneMount returns a new mount with mnt.fs as the filesystem and root as the
// root. The returned mount has an extra reference.
371 // 372 // +checklocks:vfs.mountMu 373 // +checklocksalias:mnt.vfs.mountMu=vfs.mountMu 374 func (vfs *VirtualFilesystem) cloneMount(mnt *Mount, root *Dentry, mopts *MountOptions) *Mount { 375 opts := mopts 376 if opts == nil { 377 opts = &MountOptions{ 378 Flags: mnt.Flags, 379 ReadOnly: mnt.ReadOnly(), 380 } 381 } 382 return vfs.NewDisconnectedMount(mnt.fs, root, opts) 383 } 384 385 // BindAt creates a clone of the source path's parent mount and mounts it at 386 // the target path. The new mount's root dentry is one pointed to by the source 387 // path. 388 // 389 // TODO(b/249121230): Support recursive bind mounting. 390 func (vfs *VirtualFilesystem) BindAt(ctx context.Context, creds *auth.Credentials, source, target *PathOperation) (*Mount, error) { 391 sourceVd, err := vfs.GetDentryAt(ctx, creds, source, &GetDentryOptions{}) 392 if err != nil { 393 return nil, err 394 } 395 defer sourceVd.DecRef(ctx) 396 targetVd, err := vfs.GetDentryAt(ctx, creds, target, &GetDentryOptions{}) 397 if err != nil { 398 return nil, err 399 } 400 401 vfs.mountMu.Lock() 402 clone := vfs.cloneMount(sourceVd.mount, sourceVd.dentry, nil) 403 defer clone.DecRef(ctx) 404 tree := vfs.preparePropagationTree(clone, targetVd) 405 if sourceVd.mount.propType == Shared { 406 if clone.propType == Private { 407 vfs.addPeer(sourceVd.mount, clone) 408 } else { 409 vfs.mergePeerGroup(sourceVd.mount, clone) 410 } 411 } 412 if uint32(1+len(tree))+targetVd.mount.ns.mounts > MountMax { 413 vfs.setPropagation(clone, Private) 414 vfs.abortPropagationTree(ctx, tree) 415 vfs.mountMu.Unlock() 416 targetVd.DecRef(ctx) 417 return nil, linuxerr.ENOSPC 418 } 419 420 vdsToDecRef, err := vfs.connectMountAtLocked(ctx, clone, targetVd) 421 defer func() { 422 for _, vd := range vdsToDecRef { 423 vd.DecRef(ctx) 424 } 425 }() 426 if err != nil { 427 vfs.setPropagation(clone, Private) 428 vfs.abortPropagationTree(ctx, tree) 429 vfs.mountMu.Unlock() 430 return nil, err 431 } 432 vfs.commitPropagationTree(ctx, tree) 
433 vfs.mountMu.Unlock() 434 return clone, nil 435 } 436 437 // MountAt creates and mounts a Filesystem configured by the given arguments. 438 // The VirtualFilesystem will hold a reference to the Mount until it is 439 // unmounted. 440 // 441 // This method returns the mounted Mount without a reference, for convenience 442 // during VFS setup when there is no chance of racing with unmount. 443 func (vfs *VirtualFilesystem) MountAt(ctx context.Context, creds *auth.Credentials, source string, target *PathOperation, fsTypeName string, opts *MountOptions) (*Mount, error) { 444 mnt, err := vfs.MountDisconnected(ctx, creds, source, fsTypeName, opts) 445 if err != nil { 446 return nil, err 447 } 448 defer mnt.DecRef(ctx) 449 if err := vfs.ConnectMountAt(ctx, creds, mnt, target); err != nil { 450 return nil, err 451 } 452 return mnt, nil 453 } 454 455 // UmountAt removes the Mount at the given path. 456 func (vfs *VirtualFilesystem) UmountAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *UmountOptions) error { 457 if opts.Flags&^(linux.MNT_FORCE|linux.MNT_DETACH) != 0 { 458 return linuxerr.EINVAL 459 } 460 461 // MNT_FORCE is currently unimplemented except for the permission check. 462 // Force unmounting specifically requires CAP_SYS_ADMIN in the root user 463 // namespace, and not in the owner user namespace for the target mount. See 464 // fs/namespace.c:SYSCALL_DEFINE2(umount, ...) 465 if opts.Flags&linux.MNT_FORCE != 0 && creds.HasCapabilityIn(linux.CAP_SYS_ADMIN, creds.UserNamespace.Root()) { 466 return linuxerr.EPERM 467 } 468 469 vd, err := vfs.GetDentryAt(ctx, creds, pop, &GetDentryOptions{}) 470 if err != nil { 471 return err 472 } 473 // This defer statement is encapsulated in a function because vd.mount can be 474 // modified in the block below. 
The arguments to defer are evaluated during 475 // the construction of a defer statement, so if vd.DecRef() was not 476 // encapsulated, the vd structure and its underlying pointers _at this point_ 477 // would be copied and DecRefd at the end of this function. 478 defer func() { 479 vd.DecRef(ctx) 480 }() 481 // Linux passes the LOOKUP_MOUNPOINT flag to user_path_at in ksys_umount to 482 // resolve to the toppmost mount in the stack located at the specified path. 483 // vfs.GetMountAt() imitiates this behavior. See fs/namei.c:user_path_at(...) 484 // and fs/namespace.c:ksys_umount(...). 485 if vd.dentry.isMounted() { 486 if realmnt := vfs.getMountAt(ctx, vd.mount, vd.dentry); realmnt != nil { 487 vd.mount.DecRef(ctx) 488 vd.mount = realmnt 489 } 490 } else if vd.dentry != vd.mount.root { 491 return linuxerr.EINVAL 492 } 493 494 vfs.mountMu.Lock() 495 if mntns := MountNamespaceFromContext(ctx); mntns != nil { 496 defer mntns.DecRef(ctx) 497 if mntns != vd.mount.ns { 498 vfs.mountMu.Unlock() 499 return linuxerr.EINVAL 500 } 501 502 if vd.mount == vd.mount.ns.root { 503 vfs.mountMu.Unlock() 504 return linuxerr.EINVAL 505 } 506 } 507 508 umountTree := []*Mount{vd.mount} 509 parent, mountpoint := vd.mount.parent(), vd.mount.point() 510 if parent != nil && parent.propType == Shared { 511 for peer := parent.sharedList.Front(); peer != nil; peer = peer.sharedEntry.Next() { 512 if peer == parent { 513 continue 514 } 515 umountMnt := vfs.mounts.Lookup(peer, mountpoint) 516 // From https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt: 517 // If any peer has some child mounts, then that mount is not unmounted, 518 // but all other mounts are unmounted. 
519 if umountMnt != nil && len(umountMnt.children) == 0 { 520 umountTree = append(umountTree, umountMnt) 521 } 522 } 523 } 524 525 // TODO(gvisor.dev/issue/1035): Linux special-cases umount of the caller's 526 // root, which we don't implement yet (we'll just fail it since the caller 527 // holds a reference on it). 528 529 vfs.mounts.seq.BeginWrite() 530 if opts.Flags&linux.MNT_DETACH == 0 { 531 if len(vd.mount.children) != 0 { 532 vfs.mounts.seq.EndWrite() 533 vfs.mountMu.Unlock() 534 return linuxerr.EBUSY 535 } 536 // We are holding a reference on vd.mount. 537 expectedRefs := int64(1) 538 if !vd.mount.umounted { 539 expectedRefs = 2 540 } 541 if vd.mount.refs.Load()&^math.MinInt64 != expectedRefs { // mask out MSB 542 vfs.mounts.seq.EndWrite() 543 vfs.mountMu.Unlock() 544 return linuxerr.EBUSY 545 } 546 } 547 var ( 548 vdsToDecRef []VirtualDentry 549 mountsToDecRef []*Mount 550 ) 551 for _, mnt := range umountTree { 552 vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(mnt, &umountRecursiveOptions{ 553 eager: opts.Flags&linux.MNT_DETACH == 0, 554 disconnectHierarchy: true, 555 }, vdsToDecRef, mountsToDecRef) 556 } 557 vfs.mounts.seq.EndWrite() 558 vfs.mountMu.Unlock() 559 for _, vd := range vdsToDecRef { 560 vd.DecRef(ctx) 561 } 562 for _, m := range mountsToDecRef { 563 m.DecRef(ctx) 564 } 565 return nil 566 } 567 568 // +stateify savable 569 type umountRecursiveOptions struct { 570 // If eager is true, ensure that future calls to Mount.tryIncMountedRef() 571 // on umounted mounts fail. 572 // 573 // eager is analogous to Linux's UMOUNT_SYNC. 574 eager bool 575 576 // If disconnectHierarchy is true, Mounts that are umounted hierarchically 577 // should be disconnected from their parents. (Mounts whose parents are not 578 // umounted, which in most cases means the Mount passed to the initial call 579 // to umountRecursiveLocked, are unconditionally disconnected for 580 // consistency with Linux.) 
581 // 582 // disconnectHierarchy is analogous to Linux's !UMOUNT_CONNECTED. 583 disconnectHierarchy bool 584 } 585 586 // umountRecursiveLocked marks mnt and its descendants as umounted. It does not 587 // release mount or dentry references; instead, it appends VirtualDentries and 588 // Mounts on which references must be dropped to vdsToDecRef and mountsToDecRef 589 // respectively, and returns updated slices. (This is necessary because 590 // filesystem locks possibly taken by DentryImpl.DecRef() may precede 591 // vfs.mountMu in the lock order, and Mount.DecRef() may lock vfs.mountMu.) 592 // 593 // umountRecursiveLocked is analogous to Linux's fs/namespace.c:umount_tree(). 594 // 595 // Preconditions: 596 // - vfs.mountMu must be locked. 597 // - vfs.mounts.seq must be in a writer critical section. 598 // 599 // +checklocks:vfs.mountMu 600 func (vfs *VirtualFilesystem) umountRecursiveLocked(mnt *Mount, opts *umountRecursiveOptions, vdsToDecRef []VirtualDentry, mountsToDecRef []*Mount) ([]VirtualDentry, []*Mount) { 601 if !mnt.umounted { 602 mnt.umounted = true 603 mountsToDecRef = append(mountsToDecRef, mnt) 604 if parent := mnt.parent(); parent != nil && (opts.disconnectHierarchy || !parent.umounted) { 605 vdsToDecRef = append(vdsToDecRef, vfs.disconnectLocked(mnt)) 606 } 607 if mnt.propType == Shared { 608 vfs.setPropagation(mnt, Private) 609 } 610 } 611 if opts.eager { 612 for { 613 refs := mnt.refs.Load() 614 if refs < 0 { 615 break 616 } 617 if mnt.refs.CompareAndSwap(refs, refs|math.MinInt64) { 618 break 619 } 620 } 621 } 622 for child := range mnt.children { 623 vdsToDecRef, mountsToDecRef = vfs.umountRecursiveLocked(child, opts, vdsToDecRef, mountsToDecRef) 624 } 625 return vdsToDecRef, mountsToDecRef 626 } 627 628 // connectLocked makes vd the mount parent/point for mnt. It consumes 629 // references held by vd. 630 // 631 // Preconditions: 632 // - vfs.mountMu must be locked. 633 // - vfs.mounts.seq must be in a writer critical section. 
//   - vd.dentry.mu must be locked.
//   - mnt.parent() == nil, i.e. mnt must not already be connected.
func (vfs *VirtualFilesystem) connectLocked(mnt *Mount, vd VirtualDentry, mntns *MountNamespace) {
	if checkInvariants {
		if mnt.parent() != nil {
			panic("VFS.connectLocked called on connected mount")
		}
	}
	mnt.IncRef() // dropped by callers of umountRecursiveLocked
	mnt.setKey(vd)
	if vd.mount.children == nil {
		vd.mount.children = make(map[*Mount]struct{})
	}
	vd.mount.children[mnt] = struct{}{}
	vd.dentry.mounts.Add(1)
	// Account for the new mount in its namespace.
	mnt.ns = mntns
	mntns.mountpoints[vd.dentry]++
	mntns.mounts++
	vfs.mounts.insertSeqed(mnt)
	// Record mnt in the VFS-wide index of mounts per mount point dentry.
	vfsmpmounts, ok := vfs.mountpoints[vd.dentry]
	if !ok {
		vfsmpmounts = make(map[*Mount]struct{})
		vfs.mountpoints[vd.dentry] = vfsmpmounts
	}
	vfsmpmounts[mnt] = struct{}{}
	vfs.maybeResolveMountPromise(vd)
}

// disconnectLocked makes mnt have no mount parent/point and returns its old
// mount parent/point with a reference held.
//
// Preconditions:
//   - vfs.mountMu must be locked.
//   - vfs.mounts.seq must be in a writer critical section.
//   - mnt.parent() != nil.
func (vfs *VirtualFilesystem) disconnectLocked(mnt *Mount) VirtualDentry {
	vd := mnt.getKey()
	if checkInvariants {
		if vd.mount == nil {
			panic("VFS.disconnectLocked called on disconnected mount")
		}
		if mnt.ns.mountpoints[vd.dentry] == 0 {
			panic("VFS.disconnectLocked called on dentry with zero mountpoints.")
		}
		if mnt.ns.mounts == 0 {
			panic("VFS.disconnectLocked called on namespace with zero mounts.")
		}
	}
	delete(vd.mount.children, mnt)
	vd.dentry.mounts.Add(math.MaxUint32) // -1
	// Undo the namespace accounting done by connectLocked, dropping the
	// mountpoints entry once its count reaches zero.
	mnt.ns.mountpoints[vd.dentry]--
	mnt.ns.mounts--
	if mnt.ns.mountpoints[vd.dentry] == 0 {
		delete(mnt.ns.mountpoints, vd.dentry)
	}
	vfs.mounts.removeSeqed(mnt)
	mnt.loadKey(VirtualDentry{}) // Clear mnt.key.
	// Remove mnt from the VFS-wide index of mounts per mount point dentry.
	vfsmpmounts := vfs.mountpoints[vd.dentry]
	delete(vfsmpmounts, mnt)
	if len(vfsmpmounts) == 0 {
		delete(vfs.mountpoints, vd.dentry)
	}
	return vd
}

// tryIncMountedRef increments mnt's reference count and returns true. If mnt's
// reference count is already zero, or has been eagerly umounted,
// tryIncMountedRef does nothing and returns false.
//
// tryIncMountedRef does not require that a reference is held on mnt.
func (mnt *Mount) tryIncMountedRef() bool {
	for {
		r := mnt.refs.Load()
		if r <= 0 { // r < 0 => MSB set => eagerly unmounted
			return false
		}
		// Retry if another goroutine changed refs between the load and the
		// compare-and-swap.
		if mnt.refs.CompareAndSwap(r, r+1) {
			if mnt.LogRefs() {
				refs.LogTryIncRef(mnt, r+1)
			}
			return true
		}
	}
}

// IncRef increments mnt's reference count.
func (mnt *Mount) IncRef() {
	// In general, negative values for mnt.refs are valid because the MSB is
	// the eager-unmount bit.
	r := mnt.refs.Add(1)
	if mnt.LogRefs() {
		refs.LogIncRef(mnt, r)
	}
}

// DecRef decrements mnt's reference count. When the count (ignoring the
// eager-umount bit) reaches zero, mnt is unregistered from the refs leak
// checker and destroyed.
func (mnt *Mount) DecRef(ctx context.Context) {
	r := mnt.refs.Add(-1)
	if mnt.LogRefs() {
		refs.LogDecRef(mnt, r)
	}
	if r&^math.MinInt64 == 0 { // mask out MSB
		refs.Unregister(mnt)
		mnt.destroy(ctx)
	}
}

// destroy releases the resources held by mnt once its last reference has been
// dropped: it disconnects mnt from its parent if it is still connected, then
// drops the references on mnt.root (if any), mnt.fs and the old mount point.
func (mnt *Mount) destroy(ctx context.Context) {
	var vd VirtualDentry
	if mnt.parent() != nil {
		mnt.vfs.mountMu.Lock()
		mnt.vfs.mounts.seq.BeginWrite()
		vd = mnt.vfs.disconnectLocked(mnt)
		mnt.vfs.mounts.seq.EndWrite()
		mnt.vfs.mountMu.Unlock()
	}
	// All DecRefs happen after mountMu has been released.
	if mnt.root != nil {
		mnt.root.DecRef(ctx)
	}
	mnt.fs.DecRef(ctx)
	if vd.Ok() {
		vd.DecRef(ctx)
	}
}

// RefType implements refs.CheckedObject.Type.
func (mnt *Mount) RefType() string {
	return "vfs.Mount"
}

// LeakMessage implements refs.CheckedObject.LeakMessage.
func (mnt *Mount) LeakMessage() string {
	return fmt.Sprintf("[vfs.Mount %p] reference count of %d instead of 0", mnt, mnt.refs.Load())
}

// LogRefs implements refs.CheckedObject.LogRefs.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
func (mnt *Mount) LogRefs() bool {
	return false
}

// DecRef decrements mntns' reference count. The callback passed to
// MountNamespaceRefs.DecRef umounts the namespace's whole mount tree, starting
// at its root, and then drops the collected references outside vfs.mountMu.
func (mntns *MountNamespace) DecRef(ctx context.Context) {
	vfs := mntns.root.fs.VirtualFilesystem()
	mntns.MountNamespaceRefs.DecRef(func() {
		vfs.mountMu.Lock()
		vfs.mounts.seq.BeginWrite()
		vdsToDecRef, mountsToDecRef := vfs.umountRecursiveLocked(mntns.root, &umountRecursiveOptions{
			disconnectHierarchy: true,
		}, nil, nil)
		vfs.mounts.seq.EndWrite()
		vfs.mountMu.Unlock()
		for _, vd := range vdsToDecRef {
			vd.DecRef(ctx)
		}
		for _, mnt := range mountsToDecRef {
			mnt.DecRef(ctx)
		}
	})
}

// getMountAt returns the last Mount in the stack mounted at (mnt, d). It takes
// a reference on the returned Mount. If (mnt, d) is not a mount point,
// getMountAt returns nil.
//
// getMountAt is analogous to Linux's fs/namei.c:follow_mount().
//
// Preconditions: References are held on mnt and d.
func (vfs *VirtualFilesystem) getMountAt(ctx context.Context, mnt *Mount, d *Dentry) *Mount {
	// The first mount is special-cased:
	//
	//	- The caller is assumed to have checked d.isMounted() already. (This
	//		isn't a precondition because it doesn't matter for correctness.)
	//
	//	- We return nil, instead of mnt, if there is no mount at (mnt, d).
	//
	//	- We don't drop the caller's references on mnt and d.
retryFirst:
	next := vfs.mounts.Lookup(mnt, d)
	if next == nil {
		return nil
	}
	if !next.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	mnt = next
	d = next.root
	// We don't need to take Dentry refs anywhere in this function because
	// Mounts hold references on Mount.root, which is immutable.
	for d.isMounted() {
		next := vfs.mounts.Lookup(mnt, d)
		if next == nil {
			break
		}
		if !next.tryIncMountedRef() {
			// Raced with umount.
			continue
		}
		// From here on mnt is a mount we took a reference on ourselves, so
		// that reference is released when stepping up the stack.
		mnt.DecRef(ctx)
		mnt = next
		d = next.root
	}
	return mnt
}

// getMountpointAt returns the mount point for the stack of Mounts including
// mnt. It takes a reference on the returned VirtualDentry. If no such mount
// point exists (i.e. mnt is a root mount), getMountpointAt returns the zero
// VirtualDentry.
//
// Preconditions:
//   - References are held on mnt and vfsroot.
//   - vfsroot is not (mnt, mnt.root).
func (vfs *VirtualFilesystem) getMountpointAt(ctx context.Context, mnt *Mount, vfsroot VirtualDentry) VirtualDentry {
	// The first mount is special-cased:
	//
	//	- The caller must have already checked mnt against vfsroot.
	//
	//	- We return the zero VirtualDentry, instead of mnt, if there is no
	//		mount point for mnt.
	//
	//	- We don't drop the caller's reference on mnt.
retryFirst:
	// Read mnt's key inside a seqcount reader section, then re-validate the
	// epoch after taking references so that a racing key change is detected.
	epoch := vfs.mounts.seq.BeginRead()
	parent, point := mnt.parent(), mnt.point()
	if !vfs.mounts.seq.ReadOk(epoch) {
		goto retryFirst
	}
	if parent == nil {
		return VirtualDentry{}
	}
	if !parent.tryIncMountedRef() {
		// Raced with umount.
		goto retryFirst
	}
	if !point.TryIncRef() {
		// Since Mount holds a reference on Mount.key.point, this can only
		// happen due to a racing change to Mount.key.
		parent.DecRef(ctx)
		goto retryFirst
	}
	if !vfs.mounts.seq.ReadOk(epoch) {
		point.DecRef(ctx)
		parent.DecRef(ctx)
		goto retryFirst
	}
	mnt = parent
	d := point
	// Keep walking up while (mnt, d) is itself the root of a mount, stopping
	// at vfsroot or at a mount with no parent.
	for {
		if mnt == vfsroot.mount && d == vfsroot.dentry {
			break
		}
		if d != mnt.root {
			break
		}
	retryNotFirst:
		epoch := vfs.mounts.seq.BeginRead()
		parent, point := mnt.parent(), mnt.point()
		if !vfs.mounts.seq.ReadOk(epoch) {
			goto retryNotFirst
		}
		if parent == nil {
			break
		}
		if !parent.tryIncMountedRef() {
			// Raced with umount.
			goto retryNotFirst
		}
		if !point.TryIncRef() {
			// Since Mount holds a reference on Mount.key.point, this can
			// only happen due to a racing change to Mount.key.
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		if !vfs.mounts.seq.ReadOk(epoch) {
			point.DecRef(ctx)
			parent.DecRef(ctx)
			goto retryNotFirst
		}
		// Release the references taken on the previous level before moving up.
		d.DecRef(ctx)
		mnt.DecRef(ctx)
		mnt = parent
		d = point
	}
	return VirtualDentry{mnt, d}
}

// PivotRoot makes location pointed to by newRootPop the root of the current
// namespace, and moves the current root to the location pointed to by
// putOldPop.
//
// It returns EBUSY if new_root or put_old is on the current root mount, EINVAL
// if any of the other checks below fails, and otherwise any error from
// resolving either path.
func (vfs *VirtualFilesystem) PivotRoot(ctx context.Context, creds *auth.Credentials, newRootPop *PathOperation, putOldPop *PathOperation) error {
	newRootVd, err := vfs.GetDentryAt(ctx, creds, newRootPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return err
	}
	defer newRootVd.DecRef(ctx)
	putOldVd, err := vfs.GetDentryAt(ctx, creds, putOldPop, &GetDentryOptions{CheckSearchable: true})
	if err != nil {
		return err
	}
	defer putOldVd.DecRef(ctx)
	rootVd := RootFromContext(ctx)
	defer rootVd.DecRef(ctx)

retry:
	// The checks below run inside a seqcount reader section; the section is
	// upgraded to a writer section further down, and everything is retried if
	// the mount table changed in between.
	epoch := vfs.mounts.seq.BeginRead()
	// Neither new_root nor put_old can be on the same mount as the current
	// root mount.
	if newRootVd.mount == rootVd.mount || putOldVd.mount == rootVd.mount {
		return linuxerr.EBUSY
	}
	// new_root must be a mountpoint.
	if newRootVd.mount.root != newRootVd.dentry {
		return linuxerr.EINVAL
	}
	// put_old must be at or underneath new_root.
	path, err := vfs.PathnameReachable(ctx, newRootVd, putOldVd)
	if err != nil || len(path) == 0 {
		return linuxerr.EINVAL
	}
	// The current root directory must be a mountpoint
	// (in the case it has been chrooted).
	if rootVd.mount.root != rootVd.dentry {
		return linuxerr.EINVAL
	}
	// The current root and the new root cannot be on the rootfs mount.
	if rootVd.mount.parent() == nil || newRootVd.mount.parent() == nil {
		return linuxerr.EINVAL
	}
	// The current root and the new root must be in the context's mount namespace.
	ns := MountNamespaceFromContext(ctx)
	defer ns.DecRef(ctx)
	vfs.mountMu.Lock()
	if rootVd.mount.ns != ns || newRootVd.mount.ns != ns {
		vfs.mountMu.Unlock()
		return linuxerr.EINVAL
	}

	// Fail if either the mount point at new_root, or the parent mount of that
	// mount point, has propagation type MS_SHARED.
	if newRootParent := newRootVd.mount.parent(); newRootVd.mount.propType == Shared || newRootParent.propType == Shared {
		vfs.mountMu.Unlock()
		return linuxerr.EINVAL
	}
	// Fail if put_old is a mount point and has the propagation type MS_SHARED.
	if putOldVd.mount.root == putOldVd.dentry && putOldVd.mount.propType == Shared {
		vfs.mountMu.Unlock()
		return linuxerr.EINVAL
	}

	if !vfs.mounts.seq.BeginWriteOk(epoch) {
		// Checks above raced with a mount change.
		vfs.mountMu.Unlock()
		goto retry
	}
	defer vfs.mountMu.Unlock()
	// Detach both mounts, then reattach the old root at put_old and the new
	// root at the old root's former mount point.
	mp := vfs.disconnectLocked(newRootVd.mount)
	mp.DecRef(ctx)
	rootMp := vfs.disconnectLocked(rootVd.mount)

	// connectLocked consumes a reference on putOldVd; take one for it so the
	// deferred putOldVd.DecRef above stays balanced.
	putOldVd.IncRef()
	putOldVd.dentry.mu.Lock()
	vfs.connectLocked(rootVd.mount, putOldVd, ns)
	putOldVd.dentry.mu.Unlock()

	rootMp.dentry.mu.Lock()
	vfs.connectLocked(newRootVd.mount, rootMp, ns)
	rootMp.dentry.mu.Unlock()
	vfs.mounts.seq.EndWrite()

	// connectLocked took a new reference on each mount; these drop one
	// reference apiece (presumably the ones held since the original
	// connection - confirm).
	newRootVd.mount.DecRef(ctx)
	rootVd.mount.DecRef(ctx)
	return nil
}

// SetMountReadOnly sets the mount as ReadOnly if ro is true, and clears the
// read-only state otherwise. It returns the error from setReadOnlyLocked.
func (vfs *VirtualFilesystem) SetMountReadOnly(mnt *Mount, ro bool) error {
	vfs.mountMu.Lock()
	defer vfs.mountMu.Unlock()
	return mnt.setReadOnlyLocked(ro)
}

// CheckBeginWrite increments the counter of in-progress write operations on
// mnt. If mnt is mounted MS_RDONLY, CheckBeginWrite does nothing and returns
// EROFS.
//
// If CheckBeginWrite succeeds, EndWrite must be called when the write
// operation is finished.
func (mnt *Mount) CheckBeginWrite() error {
	// A negative result means the MSB (read-only bit) is set; undo the
	// increment before failing.
	if mnt.writers.Add(1) < 0 {
		mnt.writers.Add(-1)
		return linuxerr.EROFS
	}
	return nil
}

// EndWrite indicates that a write operation signaled by a previous successful
// call to CheckBeginWrite has finished.
func (mnt *Mount) EndWrite() {
	mnt.writers.Add(-1)
}

// Preconditions: VirtualFilesystem.mountMu must be locked.
func (mnt *Mount) setReadOnlyLocked(ro bool) error {
	if oldRO := mnt.writers.Load() < 0; oldRO == ro {
		return nil
	}
	if ro {
		if !mnt.writers.CompareAndSwap(0, math.MinInt64) {
			return linuxerr.EBUSY
		}
		return nil
	}
	// Unset MSB without dropping any temporary increments from failed calls to
	// mnt.CheckBeginWrite().
1050 mnt.writers.Add(math.MinInt64) 1051 return nil 1052 } 1053 1054 // ReadOnly returns true if mount is readonly. 1055 func (mnt *Mount) ReadOnly() bool { 1056 return mnt.writers.Load() < 0 1057 } 1058 1059 // Filesystem returns the mounted Filesystem. It does not take a reference on 1060 // the returned Filesystem. 1061 func (mnt *Mount) Filesystem() *Filesystem { 1062 return mnt.fs 1063 } 1064 1065 // submountsLocked returns this Mount and all Mounts that are descendents of 1066 // it. 1067 // 1068 // Precondition: mnt.vfs.mountMu must be held. 1069 func (mnt *Mount) submountsLocked() []*Mount { 1070 mounts := []*Mount{mnt} 1071 for m := range mnt.children { 1072 mounts = append(mounts, m.submountsLocked()...) 1073 } 1074 return mounts 1075 } 1076 1077 // Root returns the mount's root. It does not take a reference on the returned 1078 // Dentry. 1079 func (mnt *Mount) Root() *Dentry { 1080 return mnt.root 1081 } 1082 1083 // Root returns mntns' root. It does not take a reference on the returned 1084 // Dentry. 1085 func (mntns *MountNamespace) Root() VirtualDentry { 1086 vd := VirtualDentry{ 1087 mount: mntns.root, 1088 dentry: mntns.root.root, 1089 } 1090 return vd 1091 } 1092 1093 // GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf. 1094 // 1095 // Preconditions: taskRootDir.Ok(). 1096 func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { 1097 rootMnt := taskRootDir.mount 1098 1099 vfs.mountMu.Lock() 1100 mounts := rootMnt.submountsLocked() 1101 // Take a reference on mounts since we need to drop vfs.mountMu before 1102 // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()). 
1103 for _, mnt := range mounts { 1104 mnt.IncRef() 1105 } 1106 vfs.mountMu.Unlock() 1107 defer func() { 1108 for _, mnt := range mounts { 1109 mnt.DecRef(ctx) 1110 } 1111 }() 1112 sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) 1113 1114 for _, mnt := range mounts { 1115 // Get the path to this mount relative to task root. 1116 mntRootVD := VirtualDentry{ 1117 mount: mnt, 1118 dentry: mnt.root, 1119 } 1120 path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) 1121 if err != nil { 1122 // For some reason we didn't get a path. Log a warning 1123 // and run with empty path. 1124 ctx.Warningf("VFS.GenerateProcMounts: error getting pathname for mount root %+v: %v", mnt.root, err) 1125 path = "" 1126 } 1127 if path == "" { 1128 // Either an error occurred, or path is not reachable 1129 // from root. 1130 break 1131 } 1132 1133 opts := "rw" 1134 if mnt.ReadOnly() { 1135 opts = "ro" 1136 } 1137 if mnt.Flags.NoATime { 1138 opts = ",noatime" 1139 } 1140 if mnt.Flags.NoExec { 1141 opts += ",noexec" 1142 } 1143 if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { 1144 opts += "," + mopts 1145 } 1146 1147 // Format: 1148 // <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order> 1149 // 1150 // The "needs dump" and "fsck order" flags are always 0, which 1151 // is allowed. 1152 fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", path, mnt.fs.FilesystemType().Name(), opts, 0, 0) 1153 } 1154 } 1155 1156 // GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to 1157 // buf. 1158 // 1159 // Preconditions: taskRootDir.Ok(). 
1160 func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) { 1161 rootMnt := taskRootDir.mount 1162 1163 vfs.mountMu.Lock() 1164 mounts := rootMnt.submountsLocked() 1165 // Take a reference on mounts since we need to drop vfs.mountMu before 1166 // calling vfs.PathnameReachable() (=> FilesystemImpl.PrependPath()) or 1167 // vfs.StatAt() (=> FilesystemImpl.StatAt()). 1168 for _, mnt := range mounts { 1169 mnt.IncRef() 1170 } 1171 vfs.mountMu.Unlock() 1172 defer func() { 1173 for _, mnt := range mounts { 1174 mnt.DecRef(ctx) 1175 } 1176 }() 1177 sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID }) 1178 1179 creds := auth.CredentialsFromContext(ctx) 1180 for _, mnt := range mounts { 1181 // Get the path to this mount relative to task root. 1182 mntRootVD := VirtualDentry{ 1183 mount: mnt, 1184 dentry: mnt.root, 1185 } 1186 pathFromRoot, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD) 1187 if err != nil { 1188 // For some reason we didn't get a path. Log a warning 1189 // and run with empty path. 1190 ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) 1191 continue 1192 } 1193 if pathFromRoot == "" { 1194 // The path is not reachable from root. 1195 continue 1196 } 1197 var pathFromFS string 1198 pathFromFS, err = vfs.PathnameInFilesystem(ctx, mntRootVD) 1199 if err != nil { 1200 // For some reason we didn't get a path. Log a warning 1201 // and run with empty path. 1202 ctx.Warningf("VFS.GenerateProcMountInfo: error getting pathname for mount root %+v: %v", mnt.root, err) 1203 continue 1204 } 1205 if pathFromFS == "" { 1206 // The path is not reachable from root. 1207 continue 1208 } 1209 // Stat the mount root to get the major/minor device numbers. 
1210 pop := &PathOperation{ 1211 Root: mntRootVD, 1212 Start: mntRootVD, 1213 } 1214 statx, err := vfs.StatAt(ctx, creds, pop, &StatOptions{}) 1215 if err != nil { 1216 // Well that's not good. Ignore this mount. 1217 ctx.Warningf("VFS.GenerateProcMountInfo: failed to stat mount root %+v: %v", mnt.root, err) 1218 continue 1219 } 1220 1221 // Format: 1222 // 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue 1223 // (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) 1224 1225 // (1) Mount ID. 1226 fmt.Fprintf(buf, "%d ", mnt.ID) 1227 1228 // (2) Parent ID (or this ID if there is no parent). 1229 // Note that even if the call to mnt.parent() races with Mount 1230 // destruction (which is possible since we're not holding vfs.mountMu), 1231 // its Mount.ID will still be valid. 1232 pID := mnt.ID 1233 if p := mnt.parent(); p != nil { 1234 pID = p.ID 1235 } 1236 fmt.Fprintf(buf, "%d ", pID) 1237 1238 // (3) Major:Minor device ID. We don't have a superblock, so we 1239 // just use the root inode device number. 1240 fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor) 1241 1242 // (4) Root: the pathname of the directory in the filesystem 1243 // which forms the root of this mount. 1244 fmt.Fprintf(buf, "%s ", manglePath(pathFromFS)) 1245 1246 // (5) Mount point (relative to process root). 1247 fmt.Fprintf(buf, "%s ", manglePath(pathFromRoot)) 1248 1249 // (6) Mount options. 1250 opts := "rw" 1251 if mnt.ReadOnly() { 1252 opts = "ro" 1253 } 1254 if mnt.Flags.NoATime { 1255 opts = ",noatime" 1256 } 1257 if mnt.Flags.NoExec { 1258 opts += ",noexec" 1259 } 1260 fmt.Fprintf(buf, "%s ", opts) 1261 1262 // (7) Optional fields: zero or more fields of the form "tag[:value]". 1263 fmt.Fprintf(buf, "%s ", mnt.generateOptionalTags()) 1264 // (8) Separator: the end of the optional fields is marked by a single hyphen. 1265 fmt.Fprintf(buf, "- ") 1266 1267 // (9) Filesystem type. 
1268 fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name()) 1269 1270 // (10) Mount source: filesystem-specific information or "none". 1271 fmt.Fprintf(buf, "none ") 1272 1273 // (11) Superblock options, and final newline. 1274 fmt.Fprintf(buf, "%s\n", superBlockOpts(pathFromRoot, mnt)) 1275 } 1276 } 1277 1278 // manglePath replaces ' ', '\t', '\n', and '\\' with their octal equivalents. 1279 // See Linux fs/seq_file.c:mangle_path. 1280 func manglePath(p string) string { 1281 r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134") 1282 return r.Replace(p) 1283 } 1284 1285 // superBlockOpts returns the super block options string for the the mount at 1286 // the given path. 1287 func superBlockOpts(mountPath string, mnt *Mount) string { 1288 // Compose super block options by combining global mount flags with 1289 // FS-specific mount options. 1290 opts := "rw" 1291 if mnt.ReadOnly() { 1292 opts = "ro" 1293 } 1294 1295 if mopts := mnt.fs.Impl().MountOptions(); mopts != "" { 1296 opts += "," + mopts 1297 } 1298 1299 // NOTE(b/147673608): If the mount is a ramdisk-based fake cgroupfs, we also 1300 // need to include the cgroup name in the options. For now we just read that 1301 // from the path. Note that this is only possible when "cgroup" isn't 1302 // registered as a valid filesystem type. 1303 // 1304 // TODO(gvisor.dev/issue/190): Once we removed fake cgroupfs support, we 1305 // should remove this. 1306 if cgroupfs := mnt.vfs.getFilesystemType("cgroup"); cgroupfs != nil && cgroupfs.opts.AllowUserMount { 1307 // Real cgroupfs available. 1308 return opts 1309 } 1310 if mnt.fs.FilesystemType().Name() == "cgroup" { 1311 splitPath := strings.Split(mountPath, "/") 1312 cgroupType := splitPath[len(splitPath)-1] 1313 opts += "," + cgroupType 1314 } 1315 1316 return opts 1317 } 1318 1319 // allocateGroupID returns a new mount group id if one is available, and 1320 // error otherwise. 
If the group ID bitmap is full, double the size of the 1321 // bitmap before allocating the new group id. 1322 // 1323 // +checklocks:vfs.mountMu 1324 func (vfs *VirtualFilesystem) allocateGroupID() (uint32, error) { 1325 groupID, err := vfs.groupIDBitmap.FirstZero(1) 1326 if err != nil { 1327 if err := vfs.groupIDBitmap.Grow(uint32(vfs.groupIDBitmap.Size())); err != nil { 1328 return 0, err 1329 } 1330 } 1331 vfs.groupIDBitmap.Add(groupID) 1332 return groupID, nil 1333 } 1334 1335 // freeGroupID marks a groupID as available for reuse. 1336 // 1337 // +checklocks:vfs.mountMu 1338 func (vfs *VirtualFilesystem) freeGroupID(id uint32) { 1339 vfs.groupIDBitmap.Remove(id) 1340 }