gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/vfs/vfs.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package vfs implements a virtual filesystem layer. 16 // 17 // Lock order: 18 // 19 // EpollInstance.interestMu 20 // FileDescription.epollMu 21 // Locks acquired by FilesystemImpl/FileDescriptionImpl methods (except IsDescendant) 22 // VirtualFilesystem.mountMu 23 // Dentry.mu 24 // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry 25 // VirtualFilesystem.filesystemsMu 26 // fdnotifier.notifier.mu 27 // EpollInstance.readyMu 28 // Inotify.mu 29 // Watches.mu 30 // Inotify.evMu 31 // VirtualFilesystem.fsTypesMu 32 // 33 // Locking Dentry.mu in multiple Dentries requires holding 34 // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple 35 // EpollInstances requires holding epollCycleMu. 36 // 37 // FilesystemImpl locks are not held during calls to FilesystemImpl.IsDescendant 38 // since it's called under mountMu. It's possible for concurrent mutation 39 // to dentry ancestors during calls to IsDescendant. Callers should take 40 // appropriate caution when using this method. 
package vfs

import (
	"fmt"
	"path"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/atomicbitops"
	"gvisor.dev/gvisor/pkg/bitmap"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/eventchannel"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	epb "gvisor.dev/gvisor/pkg/sentry/vfs/events_go_proto"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/waiter"
)

// How long to wait for a mount promise before proceeding with the VFS
// operation. This should be configurable by the user eventually.
const mountPromiseTimeout = 30 * time.Second

// mountPromise is the per-VirtualDentry state stored in
// VirtualFilesystem.mountPromises.
type mountPromise struct {
	// wq is notified with waiter.EventOut when the promise is resolved.
	wq *waiter.Queue
	// resolved is set to true when the promise is resolved, before wq is
	// notified.
	resolved atomicbitops.Bool
}

// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
//
// +stateify savable
type VirtualFilesystem struct {
	// mountMu serializes mount mutations.
	//
	// mountMu is analogous to Linux's namespace_sem.
	mountMu virtualFilesystemMutex `state:"nosave"`

	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
	// are uniquely namespaced, including mount parent in the key correctly
	// handles both bind mounts and mount namespaces; Linux does the same.)
	// Synchronization between mutators and readers is provided by mounts.seq;
	// synchronization between mutators is provided by mountMu.
	//
	// mounts is used to follow mount points during path traversal. We use a
	// single table rather than per-Dentry tables to reduce size (and therefore
	// cache footprint) for the vast majority of Dentries that are not mount
	// points.
	//
	// mounts is analogous to Linux's mount_hashtable.
	mounts mountTable `state:".([]*Mount)"`

	// mountpoints maps mount points to mounts at those points in all
	// namespaces. mountpoints is protected by mountMu.
	//
	// mountpoints is used to find mounts that must be umounted due to
	// removal of a mount point Dentry from another mount namespace. ("A file
	// or directory that is a mount point in one namespace that is not a mount
	// point in another namespace, may be renamed, unlinked, or removed
	// (rmdir(2)) in the mount namespace in which it is not a mount point
	// (subject to the usual permission checks)." - mount_namespaces(7))
	//
	// mountpoints is analogous to Linux's mountpoint_hashtable.
	mountpoints map[*Dentry]map[*Mount]struct{}

	// lastMountID is the last allocated mount ID. lastMountID is accessed
	// using atomic memory operations.
	lastMountID atomicbitops.Uint64

	// anonMount is a Mount, not included in mounts or mountpoints,
	// representing an anonFilesystem. anonMount is used to back
	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
	// anonMount is immutable.
	//
	// anonMount is analogous to Linux's anon_inode_mnt.
	anonMount *Mount

	// devices contains all registered Devices. devices is protected by
	// devicesMu.
	devicesMu sync.RWMutex `state:"nosave"`
	devices   map[devTuple]*registeredDevice

	// dynCharDevMajorUsed contains all allocated dynamic character device
	// major numbers. dynCharDevMajorUsed is protected by dynCharDevMajorMu.
	dynCharDevMajorMu   sync.Mutex `state:"nosave"`
	dynCharDevMajorUsed map[uint32]struct{}

	// anonBlockDevMinor contains all allocated anonymous block device minor
	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
	// unallocated anonymous block device number. anonBlockDevMinorNext and
	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
	anonBlockDevMinorNext uint32
	anonBlockDevMinor     map[uint32]struct{}

	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
	// fsTypesMu.
	fsTypesMu sync.RWMutex `state:"nosave"`
	fsTypes   map[string]*registeredFilesystemType

	// filesystems contains all Filesystems. filesystems is protected by
	// filesystemsMu.
	filesystemsMu sync.Mutex `state:"nosave"`
	filesystems   map[*Filesystem]struct{}

	// groupIDBitmap tracks which mount group IDs are available for allocation.
	groupIDBitmap bitmap.Bitmap

	// mountPromises contains all unresolved mount promises. Keys are
	// VirtualDentry values and values are *mountPromise.
	mountPromises sync.Map `state:".(map[VirtualDentry]*mountPromise)"`

	// toDecRef contains all the reference counted objects that needed to be
	// DecRefd while mountMu was held. It is cleared every time unlockMounts is
	// called and protected by mountMu. Each value is the number of DecRef
	// calls owed to the corresponding key.
	//
	// +checklocks:mountMu
	toDecRef map[refs.RefCounter]int
}

// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
//
// Init panics if vfs has already been initialized (detected via a non-nil
// mountpoints map) or if a device number cannot be allocated for the
// anonymous filesystem. On all other paths it returns nil.
func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
	if vfs.mountpoints != nil {
		panic("VFS already initialized")
	}
	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
	vfs.devices = make(map[devTuple]*registeredDevice)
	vfs.dynCharDevMajorUsed = make(map[uint32]struct{})
	vfs.anonBlockDevMinorNext = 1
	vfs.anonBlockDevMinor = make(map[uint32]struct{})
	vfs.fsTypes = make(map[string]*registeredFilesystemType)
	vfs.filesystems = make(map[*Filesystem]struct{})
	vfs.mounts.Init()
	vfs.groupIDBitmap = bitmap.New(1024)
	// toDecRef is annotated +checklocks:mountMu, so it is assigned with
	// mountMu held.
	vfs.mountMu.Lock()
	vfs.toDecRef = make(map[refs.RefCounter]int)
	vfs.mountMu.Unlock()

	// Construct vfs.anonMount.
	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
	if err != nil {
		// This shouldn't be possible since anonBlockDevMinorNext was
		// initialized to 1 above (no device numbers have been allocated yet).
		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
	}
	anonfs := anonFilesystem{
		devMinor: anonfsDevMinor,
	}
	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
	// NOTE(review): presumably this drops the reference taken by
	// vfsfs.Init() once the mount created below holds its own; confirm
	// against Filesystem.Init and NewDisconnectedMount.
	defer anonfs.vfsfs.DecRef(ctx)
	anonMount := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
	vfs.anonMount = anonMount

	return nil
}

// Release drops references on filesystem objects held by vfs: the reference
// on anonMount, followed by a Release call on every registered
// FilesystemType.
//
// Precondition: This must be called after VFS.Init() has succeeded.
func (vfs *VirtualFilesystem) Release(ctx context.Context) {
	vfs.anonMount.DecRef(ctx)
	for _, fst := range vfs.fsTypes {
		fst.fsType.Release(ctx)
	}
}

// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
//
// +stateify savable
type PathOperation struct {
	// Root is the VFS root. References on Root are borrowed from the provider
	// of the PathOperation.
	//
	// Invariants: Root.Ok().
	Root VirtualDentry

	// Start is the starting point for the path traversal. References on Start
	// are borrowed from the provider of the PathOperation (i.e. the caller of
	// the VFS method to which the PathOperation was passed).
	//
	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
	Start VirtualDentry

	// Path is the pathname traversed by this operation.
	Path fspath.Path

	// If FollowFinalSymlink is true, and the Dentry traversed by the final
	// path component represents a symbolic link, the symbolic link should be
	// followed.
	FollowFinalSymlink bool
}

// AccessAt checks whether a user with creds has access to the file at
// the given path.
//
// The filesystem implementation is invoked in a loop: whenever it returns an
// error that rp.handleError reports as handled, the call is repeated against
// the updated ResolvingPath. Any other error is returned to the caller.
func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error {
	rp := vfs.getResolvingPath(creds, pop)
	for {
		vfs.maybeBlockOnMountPromise(ctx, rp)
		err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats)
		if err == nil {
			rp.Release(ctx)
			return nil
		}
		if !rp.handleError(ctx, err) {
			rp.Release(ctx)
			return err
		}
	}
}

// GetDentryAt returns a VirtualDentry representing the given path, at which a
// file must exist. A reference is taken on the returned VirtualDentry.
//
// On error the returned VirtualDentry is the zero value.
func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
	rp := vfs.getResolvingPath(creds, pop)
	for {
		vfs.maybeBlockOnMountPromise(ctx, rp)
		d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
		if err == nil {
			vd := VirtualDentry{
				mount:  rp.mount,
				dentry: d,
			}
			// Take the mount reference before rp is released.
			// NOTE(review): the dentry reference is presumably taken by
			// impl.GetDentryAt; confirm against FilesystemImpl's contract.
			rp.mount.IncRef()
			rp.Release(ctx)
			return vd, nil
		}
		if !rp.handleError(ctx, err) {
			rp.Release(ctx)
			return VirtualDentry{}, err
		}
	}
}

// Preconditions: pop.Path.Begin.Ok().
285 func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { 286 rp := vfs.getResolvingPath(creds, pop) 287 for { 288 vfs.maybeBlockOnMountPromise(ctx, rp) 289 parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) 290 if err == nil { 291 parentVD := VirtualDentry{ 292 mount: rp.mount, 293 dentry: parent, 294 } 295 rp.mount.IncRef() 296 name := rp.Component() 297 rp.Release(ctx) 298 return parentVD, name, nil 299 } 300 if checkInvariants { 301 if rp.canHandleError(err) && rp.Done() { 302 panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 303 } 304 } 305 if !rp.handleError(ctx, err) { 306 rp.Release(ctx) 307 return VirtualDentry{}, "", err 308 } 309 } 310 } 311 312 // LinkAt creates a hard link at newpop representing the existing file at 313 // oldpop. 314 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { 315 oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) 316 if err != nil { 317 return err 318 } 319 320 if !newpop.Path.Begin.Ok() { 321 oldVD.DecRef(ctx) 322 if newpop.Path.Absolute { 323 return linuxerr.EEXIST 324 } 325 return linuxerr.ENOENT 326 } 327 if newpop.FollowFinalSymlink { 328 oldVD.DecRef(ctx) 329 ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") 330 return linuxerr.EINVAL 331 } 332 333 rp := vfs.getResolvingPath(creds, newpop) 334 for { 335 vfs.maybeBlockOnMountPromise(ctx, rp) 336 err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) 337 if err == nil { 338 rp.Release(ctx) 339 oldVD.DecRef(ctx) 340 return nil 341 } 342 if checkInvariants { 343 if rp.canHandleError(err) && rp.Done() { 344 panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 345 } 346 } 347 if !rp.handleError(ctx, err) { 348 rp.Release(ctx) 349 
oldVD.DecRef(ctx) 350 return err 351 } 352 } 353 } 354 355 // MkdirAt creates a directory at the given path. 356 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { 357 if !pop.Path.Begin.Ok() { 358 // pop.Path should not be empty in operations that create/delete files. 359 // This is consistent with mkdirat(dirfd, "", mode). 360 if pop.Path.Absolute { 361 return linuxerr.EEXIST 362 } 363 return linuxerr.ENOENT 364 } 365 if pop.FollowFinalSymlink { 366 ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") 367 return linuxerr.EINVAL 368 } 369 // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is 370 // also honored." - mkdir(2) 371 opts.Mode &= 0777 | linux.S_ISVTX 372 373 rp := vfs.getResolvingPath(creds, pop) 374 for { 375 vfs.maybeBlockOnMountPromise(ctx, rp) 376 err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) 377 if err == nil { 378 rp.Release(ctx) 379 return nil 380 } 381 if checkInvariants { 382 if rp.canHandleError(err) && rp.Done() { 383 panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 384 } 385 } 386 if !rp.handleError(ctx, err) { 387 rp.Release(ctx) 388 return err 389 } 390 } 391 } 392 393 // MknodAt creates a file of the given mode at the given path. It returns an 394 // error from the linuxerr package. 395 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { 396 if !pop.Path.Begin.Ok() { 397 // pop.Path should not be empty in operations that create/delete files. 398 // This is consistent with mknodat(dirfd, "", mode, dev). 
399 if pop.Path.Absolute { 400 return linuxerr.EEXIST 401 } 402 return linuxerr.ENOENT 403 } 404 if pop.FollowFinalSymlink { 405 ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") 406 return linuxerr.EINVAL 407 } 408 409 rp := vfs.getResolvingPath(creds, pop) 410 for { 411 vfs.maybeBlockOnMountPromise(ctx, rp) 412 err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) 413 if err == nil { 414 rp.Release(ctx) 415 return nil 416 } 417 if checkInvariants { 418 if rp.canHandleError(err) && rp.Done() { 419 panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 420 } 421 } 422 if !rp.handleError(ctx, err) { 423 rp.Release(ctx) 424 return err 425 } 426 } 427 } 428 429 // OpenAt returns a FileDescription providing access to the file at the given 430 // path. A reference is taken on the returned FileDescription. 431 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { 432 fsmetric.Opens.Increment() 433 434 // Remove: 435 // 436 // - O_CLOEXEC, which affects file descriptors and therefore must be 437 // handled outside of VFS. 438 // 439 // - Unknown flags. 440 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE 441 // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. 442 if opts.Flags&linux.O_SYNC != 0 { 443 opts.Flags |= linux.O_DSYNC 444 } 445 // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified 446 // with O_DIRECTORY and a writable access mode (to ensure that it fails on 447 // filesystem implementations that do not support it). 
448 if opts.Flags&linux.O_TMPFILE != 0 { 449 if opts.Flags&linux.O_DIRECTORY == 0 { 450 return nil, linuxerr.EINVAL 451 } 452 if opts.Flags&linux.O_CREAT != 0 { 453 return nil, linuxerr.EINVAL 454 } 455 if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { 456 return nil, linuxerr.EINVAL 457 } 458 } 459 // O_PATH causes most other flags to be ignored. 460 if opts.Flags&linux.O_PATH != 0 { 461 opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH 462 } 463 // "On Linux, the following bits are also honored in mode: [S_ISUID, 464 // S_ISGID, S_ISVTX]" - open(2) 465 opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX 466 467 if opts.Flags&linux.O_NOFOLLOW != 0 { 468 pop.FollowFinalSymlink = false 469 } 470 if opts.Flags&linux.O_PATH != 0 { 471 return vfs.openOPathFD(ctx, creds, pop, opts.Flags) 472 } 473 rp := vfs.getResolvingPath(creds, pop) 474 if opts.Flags&linux.O_DIRECTORY != 0 { 475 rp.mustBeDir = true 476 } 477 for { 478 vfs.maybeBlockOnMountPromise(ctx, rp) 479 fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) 480 if err == nil { 481 rp.Release(ctx) 482 483 if opts.FileExec { 484 if fd.Mount().Options().Flags.NoExec { 485 fd.DecRef(ctx) 486 return nil, linuxerr.EACCES 487 } 488 489 // Only a regular file can be executed. 490 stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) 491 if err != nil { 492 fd.DecRef(ctx) 493 return nil, err 494 } 495 if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { 496 fd.DecRef(ctx) 497 return nil, linuxerr.EACCES 498 } 499 } 500 501 fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) 502 return fd, nil 503 } 504 if !rp.handleError(ctx, err) { 505 rp.Release(ctx) 506 return nil, err 507 } 508 } 509 } 510 511 // ReadlinkAt returns the target of the symbolic link at the given path. 
512 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { 513 rp := vfs.getResolvingPath(creds, pop) 514 for { 515 vfs.maybeBlockOnMountPromise(ctx, rp) 516 target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) 517 if err == nil { 518 rp.Release(ctx) 519 return target, nil 520 } 521 if !rp.handleError(ctx, err) { 522 rp.Release(ctx) 523 return "", err 524 } 525 } 526 } 527 528 // RenameAt renames the file at oldpop to newpop. 529 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { 530 if !oldpop.Path.Begin.Ok() { 531 if oldpop.Path.Absolute { 532 return linuxerr.EBUSY 533 } 534 return linuxerr.ENOENT 535 } 536 if oldpop.FollowFinalSymlink { 537 ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") 538 return linuxerr.EINVAL 539 } 540 541 oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) 542 if err != nil { 543 return err 544 } 545 if oldName == "." || oldName == ".." 
{ 546 oldParentVD.DecRef(ctx) 547 return linuxerr.EBUSY 548 } 549 if len(oldName) > linux.NAME_MAX { 550 oldParentVD.DecRef(ctx) 551 return linuxerr.ENAMETOOLONG 552 } 553 554 if !newpop.Path.Begin.Ok() { 555 oldParentVD.DecRef(ctx) 556 if newpop.Path.Absolute { 557 return linuxerr.EBUSY 558 } 559 return linuxerr.ENOENT 560 } 561 if newpop.FollowFinalSymlink { 562 oldParentVD.DecRef(ctx) 563 ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") 564 return linuxerr.EINVAL 565 } 566 567 rp := vfs.getResolvingPath(creds, newpop) 568 renameOpts := *opts 569 if oldpop.Path.Dir { 570 renameOpts.MustBeDir = true 571 } 572 for { 573 vfs.maybeBlockOnMountPromise(ctx, rp) 574 err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) 575 if err == nil { 576 rp.Release(ctx) 577 oldParentVD.DecRef(ctx) 578 return nil 579 } 580 if checkInvariants { 581 if rp.canHandleError(err) && rp.Done() { 582 panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 583 } 584 } 585 if !rp.handleError(ctx, err) { 586 rp.Release(ctx) 587 oldParentVD.DecRef(ctx) 588 return err 589 } 590 } 591 } 592 593 // RmdirAt removes the directory at the given path. 594 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 595 if !pop.Path.Begin.Ok() { 596 // pop.Path should not be empty in operations that create/delete files. 597 // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). 
598 if pop.Path.Absolute { 599 return linuxerr.EBUSY 600 } 601 return linuxerr.ENOENT 602 } 603 if pop.FollowFinalSymlink { 604 ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") 605 return linuxerr.EINVAL 606 } 607 608 rp := vfs.getResolvingPath(creds, pop) 609 for { 610 vfs.maybeBlockOnMountPromise(ctx, rp) 611 err := rp.mount.fs.impl.RmdirAt(ctx, rp) 612 if err == nil { 613 rp.Release(ctx) 614 return nil 615 } 616 if checkInvariants { 617 if rp.canHandleError(err) && rp.Done() { 618 panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 619 } 620 } 621 if !rp.handleError(ctx, err) { 622 rp.Release(ctx) 623 return err 624 } 625 } 626 } 627 628 // SetStatAt changes metadata for the file at the given path. 629 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { 630 rp := vfs.getResolvingPath(creds, pop) 631 for { 632 vfs.maybeBlockOnMountPromise(ctx, rp) 633 err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) 634 if err == nil { 635 rp.Release(ctx) 636 return nil 637 } 638 if !rp.handleError(ctx, err) { 639 rp.Release(ctx) 640 return err 641 } 642 } 643 } 644 645 // StatAt returns metadata for the file at the given path. 646 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { 647 rp := vfs.getResolvingPath(creds, pop) 648 for { 649 vfs.maybeBlockOnMountPromise(ctx, rp) 650 stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) 651 if err == nil { 652 rp.Release(ctx) 653 return stat, nil 654 } 655 if !rp.handleError(ctx, err) { 656 rp.Release(ctx) 657 return linux.Statx{}, err 658 } 659 } 660 } 661 662 // StatFSAt returns metadata for the filesystem containing the file at the 663 // given path. 
664 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { 665 rp := vfs.getResolvingPath(creds, pop) 666 for { 667 vfs.maybeBlockOnMountPromise(ctx, rp) 668 statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) 669 if err == nil { 670 statfs.Flags |= rp.mount.MountFlags() 671 rp.Release(ctx) 672 return statfs, nil 673 } 674 if !rp.handleError(ctx, err) { 675 rp.Release(ctx) 676 return linux.Statfs{}, err 677 } 678 } 679 } 680 681 // SymlinkAt creates a symbolic link at the given path with the given target. 682 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { 683 if !pop.Path.Begin.Ok() { 684 // pop.Path should not be empty in operations that create/delete files. 685 // This is consistent with symlinkat(oldpath, newdirfd, ""). 686 if pop.Path.Absolute { 687 return linuxerr.EEXIST 688 } 689 return linuxerr.ENOENT 690 } 691 if pop.FollowFinalSymlink { 692 ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") 693 return linuxerr.EINVAL 694 } 695 696 rp := vfs.getResolvingPath(creds, pop) 697 for { 698 vfs.maybeBlockOnMountPromise(ctx, rp) 699 err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) 700 if err == nil { 701 rp.Release(ctx) 702 return nil 703 } 704 if checkInvariants { 705 if rp.canHandleError(err) && rp.Done() { 706 panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 707 } 708 } 709 if !rp.handleError(ctx, err) { 710 rp.Release(ctx) 711 return err 712 } 713 } 714 } 715 716 // UnlinkAt deletes the non-directory file at the given path. 717 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 718 if !pop.Path.Begin.Ok() { 719 // pop.Path should not be empty in operations that create/delete files. 720 // This is consistent with unlinkat(dirfd, "", 0). 
721 if pop.Path.Absolute { 722 return linuxerr.EBUSY 723 } 724 return linuxerr.ENOENT 725 } 726 if pop.FollowFinalSymlink { 727 ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") 728 return linuxerr.EINVAL 729 } 730 731 rp := vfs.getResolvingPath(creds, pop) 732 for { 733 vfs.maybeBlockOnMountPromise(ctx, rp) 734 err := rp.mount.fs.impl.UnlinkAt(ctx, rp) 735 if err == nil { 736 rp.Release(ctx) 737 return nil 738 } 739 if checkInvariants { 740 if rp.canHandleError(err) && rp.Done() { 741 panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 742 } 743 } 744 if !rp.handleError(ctx, err) { 745 rp.Release(ctx) 746 return err 747 } 748 } 749 } 750 751 // BoundEndpointAt gets the bound endpoint at the given path, if one exists. 752 func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { 753 rp := vfs.getResolvingPath(creds, pop) 754 for { 755 vfs.maybeBlockOnMountPromise(ctx, rp) 756 bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) 757 if err == nil { 758 rp.Release(ctx) 759 return bep, nil 760 } 761 if checkInvariants { 762 if rp.canHandleError(err) && rp.Done() { 763 panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 764 } 765 } 766 if !rp.handleError(ctx, err) { 767 rp.Release(ctx) 768 return nil, err 769 } 770 } 771 } 772 773 // ListXattrAt returns all extended attribute names for the file at the given 774 // path. 
775 func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { 776 rp := vfs.getResolvingPath(creds, pop) 777 for { 778 vfs.maybeBlockOnMountPromise(ctx, rp) 779 names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) 780 if err == nil { 781 rp.Release(ctx) 782 return names, nil 783 } 784 if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { 785 // Linux doesn't actually return EOPNOTSUPP in this case; instead, 786 // fs/xattr.c:vfs_listxattr() falls back to allowing the security 787 // subsystem to return security extended attributes, which by 788 // default don't exist. 789 rp.Release(ctx) 790 return nil, nil 791 } 792 if !rp.handleError(ctx, err) { 793 rp.Release(ctx) 794 return nil, err 795 } 796 } 797 } 798 799 // GetXattrAt returns the value associated with the given extended attribute 800 // for the file at the given path. 801 func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { 802 rp := vfs.getResolvingPath(creds, pop) 803 for { 804 vfs.maybeBlockOnMountPromise(ctx, rp) 805 val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) 806 if err == nil { 807 rp.Release(ctx) 808 return val, nil 809 } 810 if !rp.handleError(ctx, err) { 811 rp.Release(ctx) 812 return "", err 813 } 814 } 815 } 816 817 // SetXattrAt changes the value associated with the given extended attribute 818 // for the file at the given path. 
819 func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { 820 rp := vfs.getResolvingPath(creds, pop) 821 for { 822 vfs.maybeBlockOnMountPromise(ctx, rp) 823 err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) 824 if err == nil { 825 rp.Release(ctx) 826 return nil 827 } 828 if !rp.handleError(ctx, err) { 829 rp.Release(ctx) 830 return err 831 } 832 } 833 } 834 835 // RemoveXattrAt removes the given extended attribute from the file at rp. 836 func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { 837 rp := vfs.getResolvingPath(creds, pop) 838 for { 839 vfs.maybeBlockOnMountPromise(ctx, rp) 840 err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) 841 if err == nil { 842 rp.Release(ctx) 843 return nil 844 } 845 if !rp.handleError(ctx, err) { 846 rp.Release(ctx) 847 return err 848 } 849 } 850 } 851 852 // SyncAllFilesystems has the semantics of Linux's sync(2). 853 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { 854 var retErr error 855 for fs := range vfs.getFilesystems() { 856 if err := fs.impl.Sync(ctx); err != nil && retErr == nil { 857 retErr = err 858 } 859 fs.DecRef(ctx) 860 } 861 return retErr 862 } 863 864 func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} { 865 fss := make(map[*Filesystem]struct{}) 866 vfs.filesystemsMu.Lock() 867 defer vfs.filesystemsMu.Unlock() 868 for fs := range vfs.filesystems { 869 if !fs.TryIncRef() { 870 continue 871 } 872 fss[fs] = struct{}{} 873 } 874 return fss 875 } 876 877 // MkdirAllAt recursively creates non-existent directories on the given path 878 // (including the last component). 
879 func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions, mustBeDir bool) error { 880 pop := &PathOperation{ 881 Root: root, 882 Start: root, 883 Path: fspath.Parse(currentPath), 884 } 885 stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) 886 switch { 887 case err == nil: 888 if mustBeDir && (stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory) { 889 return linuxerr.ENOTDIR 890 } 891 // Directory already exists. 892 return nil 893 case linuxerr.Equals(linuxerr.ENOENT, err): 894 // Expected, we will create the dir. 895 default: 896 return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) 897 } 898 899 // Recurse to ensure parent is created and then create the final directory. 900 if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 901 return err 902 } 903 if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { 904 return fmt.Errorf("failed to create directory %q: %w", currentPath, err) 905 } 906 return nil 907 } 908 909 // MakeSyntheticMountpoint creates parent directories of target if they do not 910 // exist and attempts to create a directory for the mountpoint. If a 911 // non-directory file already exists there then we allow it. 912 func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { 913 mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} 914 915 // Make sure the parent directory of target exists. 916 if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 917 return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) 918 } 919 920 // Attempt to mkdir the final component. 
If a file (of any type) exists 921 // then we let allow mounting on top of that because we do not require the 922 // target to be an existing directory, unlike Linux mount(2). 923 if err := vfs.MkdirAllAt(ctx, target, root, creds, mkdirOpts, false /* mustBeDir */); err != nil { 924 return fmt.Errorf("failed to create mountpoint %q: %w", target, err) 925 } 926 return nil 927 } 928 929 func (vfs *VirtualFilesystem) getMountPromise(vd VirtualDentry) *mountPromise { 930 if mp, ok := vfs.mountPromises.Load(vd); ok { 931 return mp.(*mountPromise) 932 } 933 return nil 934 } 935 936 // RegisterMountPromise marks vd as a mount promise. This means any VFS 937 // operation on vd will be blocked until another process mounts over it or the 938 // mount promise times out. 939 func (vfs *VirtualFilesystem) RegisterMountPromise(vd VirtualDentry) error { 940 if _, loaded := vfs.mountPromises.LoadOrStore(vd, &mountPromise{wq: &waiter.Queue{}}); loaded { 941 return fmt.Errorf("mount promise already registered for %v", vd) 942 } 943 return nil 944 } 945 946 // Emit a SentryMountPromiseBlockEvent and wait for the mount promise to be 947 // resolved or time out. 948 func (vfs *VirtualFilesystem) maybeBlockOnMountPromise(ctx context.Context, rp *ResolvingPath) { 949 vd := VirtualDentry{rp.mount, rp.start} 950 mp := vfs.getMountPromise(vd) 951 if mp == nil { 952 return 953 } else if mp.resolved.Load() { 954 vfs.updateResolvingPathForMountPromise(ctx, rp) 955 return 956 } 957 958 e, ch := waiter.NewChannelEntry(waiter.EventOut) 959 mp.wq.EventRegister(&e) 960 defer mp.wq.EventUnregister(&e) 961 962 var ( 963 path string 964 err error 965 ) 966 // Unblock waiter entries that were created after this mount promise was 967 // resolved by a racing thread. 
968 if mp.resolved.Load() { 969 close(ch) 970 } else { 971 root := RootFromContext(ctx) 972 defer root.DecRef(ctx) 973 path, err = vfs.PathnameReachable(ctx, root, vd) 974 if err != nil { 975 panic(fmt.Sprintf("could not reach %v from root", rp.Component())) 976 } 977 if path == "" { 978 log.Warningf("Attempting to block for a mount promise on an empty path.") 979 return 980 } 981 eventchannel.Emit(&epb.SentryMountPromiseBlockEvent{Path: path}) 982 } 983 984 select { 985 case <-ch: 986 vfs.updateResolvingPathForMountPromise(ctx, rp) 987 case <-time.After(mountPromiseTimeout): 988 panic(fmt.Sprintf("mount promise for %s timed out, unable to proceed", path)) 989 } 990 } 991 992 func (vfs *VirtualFilesystem) updateResolvingPathForMountPromise(ctx context.Context, rp *ResolvingPath) { 993 newMnt := vfs.getMountAt(ctx, rp.mount, rp.start) 994 rp.mount = newMnt 995 rp.start = newMnt.root 996 rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef 997 } 998 999 func (vfs *VirtualFilesystem) maybeResolveMountPromise(vd VirtualDentry) { 1000 if mp := vfs.getMountPromise(vd); mp != nil { 1001 mp.resolved.Store(true) 1002 mp.wq.Notify(waiter.EventOut) 1003 } 1004 } 1005 1006 // PopDelayedDecRefs transfers the ownership of vfs.toDecRef to the caller via 1007 // the returned list. It is the caller's responsibility to DecRef these object 1008 // later. They must be DecRef'd outside of mountMu. 1009 // 1010 // +checklocks:vfs.mountMu 1011 func (vfs *VirtualFilesystem) PopDelayedDecRefs() []refs.RefCounter { 1012 var rcs []refs.RefCounter 1013 for rc, refs := range vfs.toDecRef { 1014 for i := 0; i < refs; i++ { 1015 rcs = append(rcs, rc) 1016 } 1017 } 1018 clear(vfs.toDecRef) 1019 return rcs 1020 } 1021 1022 // delayDecRef saves a reference counted object so that it can be DecRef'd 1023 // outside of vfs.mountMu. 
This is necessary because filesystem locks possibly 1024 // taken by DentryImpl.DecRef() may precede vfs.mountMu in the lock order, and 1025 // Mount.DecRef() may lock vfs.mountMu. 1026 // 1027 // +checklocks:vfs.mountMu 1028 func (vfs *VirtualFilesystem) delayDecRef(rc refs.RefCounter) { 1029 vfs.toDecRef[rc]++ 1030 } 1031 1032 // Use this instead of vfs.mountMu.Lock(). 1033 // 1034 // +checklocksacquire:vfs.mountMu 1035 func (vfs *VirtualFilesystem) lockMounts() { 1036 vfs.mountMu.Lock() 1037 } 1038 1039 // Use this instead of vfs.mountMu.Unlock(). This method DecRefs any reference 1040 // counted objects that were collected while mountMu was held. 1041 // 1042 // +checklocksrelease:vfs.mountMu 1043 func (vfs *VirtualFilesystem) unlockMounts(ctx context.Context) { 1044 if len(vfs.toDecRef) == 0 { 1045 vfs.mountMu.Unlock() 1046 return 1047 } 1048 toDecRef := vfs.toDecRef 1049 // Can't use `clear` here as this would reference the same map as `toDecRef`. 1050 vfs.toDecRef = map[refs.RefCounter]int{} 1051 vfs.mountMu.Unlock() 1052 for rc, refs := range toDecRef { 1053 for i := 0; i < refs; i++ { 1054 rc.DecRef(ctx) 1055 } 1056 } 1057 } 1058 1059 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry 1060 // (which represents a node in a Filesystem's tree) and a Mount (which 1061 // represents the Filesystem's position in a VFS mount tree). 1062 // 1063 // VirtualDentry's semantics are similar to that of a Go interface object 1064 // representing a pointer: it is a copyable value type that represents 1065 // references to another entity. The zero value of VirtualDentry is an "empty 1066 // VirtualDentry", directly analogous to a nil interface object. 1067 // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless 1068 // otherwise specified, all other VirtualDentry methods require 1069 // VirtualDentry.Ok() == true. 
1070 // 1071 // Mounts and Dentries are reference-counted, requiring that users call 1072 // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to 1073 // references on the Mount and Dentry referred to by a VirtualDentry as 1074 // references on the VirtualDentry itself. Unless otherwise specified, all 1075 // VirtualDentry methods require that a reference is held on the VirtualDentry. 1076 // 1077 // VirtualDentry is analogous to Linux's struct path. 1078 // 1079 // +stateify savable 1080 type VirtualDentry struct { 1081 mount *Mount 1082 dentry *Dentry 1083 } 1084 1085 // MakeVirtualDentry creates a VirtualDentry. 1086 func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { 1087 return VirtualDentry{ 1088 mount: mount, 1089 dentry: dentry, 1090 } 1091 } 1092 1093 // Ok returns true if vd is not empty. It does not require that a reference is 1094 // held. 1095 func (vd VirtualDentry) Ok() bool { 1096 return vd.mount != nil 1097 } 1098 1099 // IncRef increments the reference counts on the Mount and Dentry represented 1100 // by vd. 1101 func (vd VirtualDentry) IncRef() { 1102 vd.mount.IncRef() 1103 vd.dentry.IncRef() 1104 } 1105 1106 // DecRef decrements the reference counts on the Mount and Dentry represented 1107 // by vd. 1108 func (vd VirtualDentry) DecRef(ctx context.Context) { 1109 vd.dentry.DecRef(ctx) 1110 vd.mount.DecRef(ctx) 1111 } 1112 1113 // Mount returns the Mount associated with vd. It does not take a reference on 1114 // the returned Mount. 1115 func (vd VirtualDentry) Mount() *Mount { 1116 return vd.mount 1117 } 1118 1119 // Dentry returns the Dentry associated with vd. It does not take a reference 1120 // on the returned Dentry. 1121 func (vd VirtualDentry) Dentry() *Dentry { 1122 return vd.dentry 1123 }