github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/vfs/vfs.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package vfs implements a virtual filesystem layer. 16 // 17 // Lock order: 18 // 19 // EpollInstance.interestMu 20 // FileDescription.epollMu 21 // Locks acquired by FilesystemImpl/FileDescriptionImpl methods 22 // VirtualFilesystem.mountMu 23 // Dentry.mu 24 // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry 25 // VirtualFilesystem.filesystemsMu 26 // fdnotifier.notifier.mu 27 // EpollInstance.readyMu 28 // Inotify.mu 29 // Watches.mu 30 // Inotify.evMu 31 // VirtualFilesystem.fsTypesMu 32 // 33 // Locking Dentry.mu in multiple Dentries requires holding 34 // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple 35 // EpollInstances requires holding epollCycleMu. 
package vfs

import (
	"fmt"
	"path"
	"time"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/bitmap"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/eventchannel"
	"github.com/MerlinKodo/gvisor/pkg/fspath"
	"github.com/MerlinKodo/gvisor/pkg/refs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/fsmetric"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix/transport"
	epb "github.com/MerlinKodo/gvisor/pkg/sentry/vfs/events_go_proto"
	"github.com/MerlinKodo/gvisor/pkg/sync"
	"github.com/MerlinKodo/gvisor/pkg/waiter"
)

// mountPromiseTimeout is how long maybeBlockOnMountPromise waits for a mount
// promise to be resolved. Note that when the timeout expires the current
// implementation panics instead of proceeding with the VFS operation. This
// should be configurable by the user eventually.
const mountPromiseTimeout = 10 * time.Second

// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
//
// +stateify savable
type VirtualFilesystem struct {
	// mountMu serializes mount mutations.
	//
	// mountMu is analogous to Linux's namespace_sem.
	mountMu virtualFilesystemMutex `state:"nosave"`

	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
	// are uniquely namespaced, including mount parent in the key correctly
	// handles both bind mounts and mount namespaces; Linux does the same.)
	// Synchronization between mutators and readers is provided by mounts.seq;
	// synchronization between mutators is provided by mountMu.
	//
	// mounts is used to follow mount points during path traversal. We use a
	// single table rather than per-Dentry tables to reduce size (and therefore
	// cache footprint) for the vast majority of Dentries that are not mount
	// points.
	//
	// mounts is analogous to Linux's mount_hashtable.
	mounts mountTable `state:".([]*Mount)"`

	// mountpoints maps mount points to mounts at those points in all
	// namespaces. mountpoints is protected by mountMu.
	//
	// mountpoints is used to find mounts that must be umounted due to
	// removal of a mount point Dentry from another mount namespace. ("A file
	// or directory that is a mount point in one namespace that is not a mount
	// point in another namespace, may be renamed, unlinked, or removed
	// (rmdir(2)) in the mount namespace in which it is not a mount point
	// (subject to the usual permission checks)." - mount_namespaces(7))
	//
	// mountpoints is analogous to Linux's mountpoint_hashtable.
	mountpoints map[*Dentry]map[*Mount]struct{}

	// lastMountID is the last allocated mount ID. lastMountID is accessed
	// using atomic memory operations.
	lastMountID atomicbitops.Uint64

	// anonMount is a Mount, not included in mounts or mountpoints,
	// representing an anonFilesystem. anonMount is used to back
	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
	// anonMount is immutable.
	//
	// anonMount is analogous to Linux's anon_inode_mnt.
	anonMount *Mount

	// devices contains all registered Devices. devices is protected by
	// devicesMu.
	devicesMu sync.RWMutex `state:"nosave"`
	devices   map[devTuple]*registeredDevice

	// dynCharDevMajorUsed contains all allocated dynamic character device
	// major numbers. dynCharDevMajorUsed is protected by dynCharDevMajorMu.
	dynCharDevMajorMu   sync.Mutex `state:"nosave"`
	dynCharDevMajorUsed map[uint32]struct{}

	// anonBlockDevMinor contains all allocated anonymous block device minor
	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
	// unallocated anonymous block device number. anonBlockDevMinorNext and
	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
	anonBlockDevMinorNext uint32
	anonBlockDevMinor     map[uint32]struct{}

	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
	// fsTypesMu.
	fsTypesMu sync.RWMutex `state:"nosave"`
	fsTypes   map[string]*registeredFilesystemType

	// filesystems contains all Filesystems. filesystems is protected by
	// filesystemsMu.
	filesystemsMu sync.Mutex `state:"nosave"`
	filesystems   map[*Filesystem]struct{}

	// groupIDBitmap tracks which mount group IDs are available for allocation.
	groupIDBitmap bitmap.Bitmap

	// mountPromises contains all unresolved mount promises, keyed by the
	// promised VirtualDentry; the value is the queue that waiters register on.
	// mountPromises is protected by mountPromisesMu.
	mountPromisesMu sync.RWMutex `state:"nosave"`
	mountPromises   map[VirtualDentry]*waiter.Queue

	// toDecRef contains all the reference counted objects that needed to be
	// DecRefd while mountMu was held. It is cleared every time unlockMounts is
	// called and protected by mountMu.
	//
	// +checklocks:mountMu
	toDecRef map[refs.RefCounter]int
}

// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
158 func (vfs *VirtualFilesystem) Init(ctx context.Context) error { 159 if vfs.mountpoints != nil { 160 panic("VFS already initialized") 161 } 162 vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) 163 vfs.devices = make(map[devTuple]*registeredDevice) 164 vfs.dynCharDevMajorUsed = make(map[uint32]struct{}) 165 vfs.anonBlockDevMinorNext = 1 166 vfs.anonBlockDevMinor = make(map[uint32]struct{}) 167 vfs.fsTypes = make(map[string]*registeredFilesystemType) 168 vfs.filesystems = make(map[*Filesystem]struct{}) 169 vfs.mounts.Init() 170 vfs.groupIDBitmap = bitmap.New(1024) 171 vfs.mountPromises = make(map[VirtualDentry]*waiter.Queue) 172 vfs.mountMu.Lock() 173 vfs.toDecRef = make(map[refs.RefCounter]int) 174 vfs.mountMu.Unlock() 175 176 // Construct vfs.anonMount. 177 anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() 178 if err != nil { 179 // This shouldn't be possible since anonBlockDevMinorNext was 180 // initialized to 1 above (no device numbers have been allocated yet). 181 panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err)) 182 } 183 anonfs := anonFilesystem{ 184 devMinor: anonfsDevMinor, 185 } 186 anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) 187 defer anonfs.vfsfs.DecRef(ctx) 188 anonMount := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) 189 vfs.anonMount = anonMount 190 191 return nil 192 } 193 194 // Release drops references on filesystem objects held by vfs. 195 // 196 // Precondition: This must be called after VFS.Init() has succeeded. 197 func (vfs *VirtualFilesystem) Release(ctx context.Context) { 198 vfs.anonMount.DecRef(ctx) 199 for _, fst := range vfs.fsTypes { 200 fst.fsType.Release(ctx) 201 } 202 } 203 204 // PathOperation specifies the path operated on by a VFS method. 205 // 206 // PathOperation is passed to VFS methods by pointer to reduce memory copying: 207 // it's somewhat large and should never escape. 
(Options structs are passed by 208 // pointer to VFS and FileDescription methods for the same reason.) 209 // 210 // +stateify savable 211 type PathOperation struct { 212 // Root is the VFS root. References on Root are borrowed from the provider 213 // of the PathOperation. 214 // 215 // Invariants: Root.Ok(). 216 Root VirtualDentry 217 218 // Start is the starting point for the path traversal. References on Start 219 // are borrowed from the provider of the PathOperation (i.e. the caller of 220 // the VFS method to which the PathOperation was passed). 221 // 222 // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. 223 Start VirtualDentry 224 225 // Path is the pathname traversed by this operation. 226 Path fspath.Path 227 228 // If FollowFinalSymlink is true, and the Dentry traversed by the final 229 // path component represents a symbolic link, the symbolic link should be 230 // followed. 231 FollowFinalSymlink bool 232 } 233 234 // AccessAt checks whether a user with creds has access to the file at 235 // the given path. 236 func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { 237 rp := vfs.getResolvingPath(creds, pop) 238 for { 239 vfs.maybeBlockOnMountPromise(ctx, rp) 240 err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) 241 if err == nil { 242 rp.Release(ctx) 243 return nil 244 } 245 if !rp.handleError(ctx, err) { 246 rp.Release(ctx) 247 return err 248 } 249 } 250 } 251 252 // GetDentryAt returns a VirtualDentry representing the given path, at which a 253 // file must exist. A reference is taken on the returned VirtualDentry. 
254 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { 255 rp := vfs.getResolvingPath(creds, pop) 256 for { 257 vfs.maybeBlockOnMountPromise(ctx, rp) 258 d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) 259 if err == nil { 260 vd := VirtualDentry{ 261 mount: rp.mount, 262 dentry: d, 263 } 264 rp.mount.IncRef() 265 rp.Release(ctx) 266 return vd, nil 267 } 268 if !rp.handleError(ctx, err) { 269 rp.Release(ctx) 270 return VirtualDentry{}, err 271 } 272 } 273 } 274 275 // Preconditions: pop.Path.Begin.Ok(). 276 func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { 277 rp := vfs.getResolvingPath(creds, pop) 278 for { 279 vfs.maybeBlockOnMountPromise(ctx, rp) 280 parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) 281 if err == nil { 282 parentVD := VirtualDentry{ 283 mount: rp.mount, 284 dentry: parent, 285 } 286 rp.mount.IncRef() 287 name := rp.Component() 288 rp.Release(ctx) 289 return parentVD, name, nil 290 } 291 if checkInvariants { 292 if rp.canHandleError(err) && rp.Done() { 293 panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 294 } 295 } 296 if !rp.handleError(ctx, err) { 297 rp.Release(ctx) 298 return VirtualDentry{}, "", err 299 } 300 } 301 } 302 303 // LinkAt creates a hard link at newpop representing the existing file at 304 // oldpop. 
305 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { 306 oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) 307 if err != nil { 308 return err 309 } 310 311 if !newpop.Path.Begin.Ok() { 312 oldVD.DecRef(ctx) 313 if newpop.Path.Absolute { 314 return linuxerr.EEXIST 315 } 316 return linuxerr.ENOENT 317 } 318 if newpop.FollowFinalSymlink { 319 oldVD.DecRef(ctx) 320 ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") 321 return linuxerr.EINVAL 322 } 323 324 rp := vfs.getResolvingPath(creds, newpop) 325 for { 326 vfs.maybeBlockOnMountPromise(ctx, rp) 327 err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) 328 if err == nil { 329 rp.Release(ctx) 330 oldVD.DecRef(ctx) 331 return nil 332 } 333 if checkInvariants { 334 if rp.canHandleError(err) && rp.Done() { 335 panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 336 } 337 } 338 if !rp.handleError(ctx, err) { 339 rp.Release(ctx) 340 oldVD.DecRef(ctx) 341 return err 342 } 343 } 344 } 345 346 // MkdirAt creates a directory at the given path. 347 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { 348 if !pop.Path.Begin.Ok() { 349 // pop.Path should not be empty in operations that create/delete files. 350 // This is consistent with mkdirat(dirfd, "", mode). 351 if pop.Path.Absolute { 352 return linuxerr.EEXIST 353 } 354 return linuxerr.ENOENT 355 } 356 if pop.FollowFinalSymlink { 357 ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") 358 return linuxerr.EINVAL 359 } 360 // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is 361 // also honored." 
- mkdir(2) 362 opts.Mode &= 0777 | linux.S_ISVTX 363 364 rp := vfs.getResolvingPath(creds, pop) 365 for { 366 vfs.maybeBlockOnMountPromise(ctx, rp) 367 err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) 368 if err == nil { 369 rp.Release(ctx) 370 return nil 371 } 372 if checkInvariants { 373 if rp.canHandleError(err) && rp.Done() { 374 panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 375 } 376 } 377 if !rp.handleError(ctx, err) { 378 rp.Release(ctx) 379 return err 380 } 381 } 382 } 383 384 // MknodAt creates a file of the given mode at the given path. It returns an 385 // error from the linuxerr package. 386 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { 387 if !pop.Path.Begin.Ok() { 388 // pop.Path should not be empty in operations that create/delete files. 389 // This is consistent with mknodat(dirfd, "", mode, dev). 390 if pop.Path.Absolute { 391 return linuxerr.EEXIST 392 } 393 return linuxerr.ENOENT 394 } 395 if pop.FollowFinalSymlink { 396 ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") 397 return linuxerr.EINVAL 398 } 399 400 rp := vfs.getResolvingPath(creds, pop) 401 for { 402 vfs.maybeBlockOnMountPromise(ctx, rp) 403 err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) 404 if err == nil { 405 rp.Release(ctx) 406 return nil 407 } 408 if checkInvariants { 409 if rp.canHandleError(err) && rp.Done() { 410 panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 411 } 412 } 413 if !rp.handleError(ctx, err) { 414 rp.Release(ctx) 415 return err 416 } 417 } 418 } 419 420 // OpenAt returns a FileDescription providing access to the file at the given 421 // path. A reference is taken on the returned FileDescription. 
422 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { 423 fsmetric.Opens.Increment() 424 425 // Remove: 426 // 427 // - O_CLOEXEC, which affects file descriptors and therefore must be 428 // handled outside of VFS. 429 // 430 // - Unknown flags. 431 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE 432 // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. 433 if opts.Flags&linux.O_SYNC != 0 { 434 opts.Flags |= linux.O_DSYNC 435 } 436 // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified 437 // with O_DIRECTORY and a writable access mode (to ensure that it fails on 438 // filesystem implementations that do not support it). 439 if opts.Flags&linux.O_TMPFILE != 0 { 440 if opts.Flags&linux.O_DIRECTORY == 0 { 441 return nil, linuxerr.EINVAL 442 } 443 if opts.Flags&linux.O_CREAT != 0 { 444 return nil, linuxerr.EINVAL 445 } 446 if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { 447 return nil, linuxerr.EINVAL 448 } 449 } 450 // O_PATH causes most other flags to be ignored. 
451 if opts.Flags&linux.O_PATH != 0 { 452 opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH 453 } 454 // "On Linux, the following bits are also honored in mode: [S_ISUID, 455 // S_ISGID, S_ISVTX]" - open(2) 456 opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX 457 458 if opts.Flags&linux.O_NOFOLLOW != 0 { 459 pop.FollowFinalSymlink = false 460 } 461 if opts.Flags&linux.O_PATH != 0 { 462 return vfs.openOPathFD(ctx, creds, pop, opts.Flags) 463 } 464 rp := vfs.getResolvingPath(creds, pop) 465 if opts.Flags&linux.O_DIRECTORY != 0 { 466 rp.mustBeDir = true 467 } 468 for { 469 vfs.maybeBlockOnMountPromise(ctx, rp) 470 fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) 471 if err == nil { 472 rp.Release(ctx) 473 474 if opts.FileExec { 475 if fd.Mount().Flags.NoExec { 476 fd.DecRef(ctx) 477 return nil, linuxerr.EACCES 478 } 479 480 // Only a regular file can be executed. 481 stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) 482 if err != nil { 483 fd.DecRef(ctx) 484 return nil, err 485 } 486 if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { 487 fd.DecRef(ctx) 488 return nil, linuxerr.EACCES 489 } 490 } 491 492 fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) 493 return fd, nil 494 } 495 if !rp.handleError(ctx, err) { 496 rp.Release(ctx) 497 return nil, err 498 } 499 } 500 } 501 502 // ReadlinkAt returns the target of the symbolic link at the given path. 503 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { 504 rp := vfs.getResolvingPath(creds, pop) 505 for { 506 vfs.maybeBlockOnMountPromise(ctx, rp) 507 target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) 508 if err == nil { 509 rp.Release(ctx) 510 return target, nil 511 } 512 if !rp.handleError(ctx, err) { 513 rp.Release(ctx) 514 return "", err 515 } 516 } 517 } 518 519 // RenameAt renames the file at oldpop to newpop. 
520 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { 521 if !oldpop.Path.Begin.Ok() { 522 if oldpop.Path.Absolute { 523 return linuxerr.EBUSY 524 } 525 return linuxerr.ENOENT 526 } 527 if oldpop.FollowFinalSymlink { 528 ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") 529 return linuxerr.EINVAL 530 } 531 532 oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) 533 if err != nil { 534 return err 535 } 536 if oldName == "." || oldName == ".." { 537 oldParentVD.DecRef(ctx) 538 return linuxerr.EBUSY 539 } 540 if len(oldName) > linux.NAME_MAX { 541 oldParentVD.DecRef(ctx) 542 return linuxerr.ENAMETOOLONG 543 } 544 545 if !newpop.Path.Begin.Ok() { 546 oldParentVD.DecRef(ctx) 547 if newpop.Path.Absolute { 548 return linuxerr.EBUSY 549 } 550 return linuxerr.ENOENT 551 } 552 if newpop.FollowFinalSymlink { 553 oldParentVD.DecRef(ctx) 554 ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") 555 return linuxerr.EINVAL 556 } 557 558 rp := vfs.getResolvingPath(creds, newpop) 559 renameOpts := *opts 560 if oldpop.Path.Dir { 561 renameOpts.MustBeDir = true 562 } 563 for { 564 vfs.maybeBlockOnMountPromise(ctx, rp) 565 err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) 566 if err == nil { 567 rp.Release(ctx) 568 oldParentVD.DecRef(ctx) 569 return nil 570 } 571 if checkInvariants { 572 if rp.canHandleError(err) && rp.Done() { 573 panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 574 } 575 } 576 if !rp.handleError(ctx, err) { 577 rp.Release(ctx) 578 oldParentVD.DecRef(ctx) 579 return err 580 } 581 } 582 } 583 584 // RmdirAt removes the directory at the given path. 
585 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 586 if !pop.Path.Begin.Ok() { 587 // pop.Path should not be empty in operations that create/delete files. 588 // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). 589 if pop.Path.Absolute { 590 return linuxerr.EBUSY 591 } 592 return linuxerr.ENOENT 593 } 594 if pop.FollowFinalSymlink { 595 ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") 596 return linuxerr.EINVAL 597 } 598 599 rp := vfs.getResolvingPath(creds, pop) 600 for { 601 vfs.maybeBlockOnMountPromise(ctx, rp) 602 err := rp.mount.fs.impl.RmdirAt(ctx, rp) 603 if err == nil { 604 rp.Release(ctx) 605 return nil 606 } 607 if checkInvariants { 608 if rp.canHandleError(err) && rp.Done() { 609 panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 610 } 611 } 612 if !rp.handleError(ctx, err) { 613 rp.Release(ctx) 614 return err 615 } 616 } 617 } 618 619 // SetStatAt changes metadata for the file at the given path. 620 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { 621 rp := vfs.getResolvingPath(creds, pop) 622 for { 623 vfs.maybeBlockOnMountPromise(ctx, rp) 624 err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) 625 if err == nil { 626 rp.Release(ctx) 627 return nil 628 } 629 if !rp.handleError(ctx, err) { 630 rp.Release(ctx) 631 return err 632 } 633 } 634 } 635 636 // StatAt returns metadata for the file at the given path. 
637 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { 638 rp := vfs.getResolvingPath(creds, pop) 639 for { 640 vfs.maybeBlockOnMountPromise(ctx, rp) 641 stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) 642 if err == nil { 643 rp.Release(ctx) 644 return stat, nil 645 } 646 if !rp.handleError(ctx, err) { 647 rp.Release(ctx) 648 return linux.Statx{}, err 649 } 650 } 651 } 652 653 // StatFSAt returns metadata for the filesystem containing the file at the 654 // given path. 655 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { 656 rp := vfs.getResolvingPath(creds, pop) 657 for { 658 vfs.maybeBlockOnMountPromise(ctx, rp) 659 statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) 660 if err == nil { 661 rp.Release(ctx) 662 return statfs, nil 663 } 664 if !rp.handleError(ctx, err) { 665 rp.Release(ctx) 666 return linux.Statfs{}, err 667 } 668 } 669 } 670 671 // SymlinkAt creates a symbolic link at the given path with the given target. 672 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { 673 if !pop.Path.Begin.Ok() { 674 // pop.Path should not be empty in operations that create/delete files. 675 // This is consistent with symlinkat(oldpath, newdirfd, ""). 
676 if pop.Path.Absolute { 677 return linuxerr.EEXIST 678 } 679 return linuxerr.ENOENT 680 } 681 if pop.FollowFinalSymlink { 682 ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") 683 return linuxerr.EINVAL 684 } 685 686 rp := vfs.getResolvingPath(creds, pop) 687 for { 688 vfs.maybeBlockOnMountPromise(ctx, rp) 689 err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) 690 if err == nil { 691 rp.Release(ctx) 692 return nil 693 } 694 if checkInvariants { 695 if rp.canHandleError(err) && rp.Done() { 696 panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 697 } 698 } 699 if !rp.handleError(ctx, err) { 700 rp.Release(ctx) 701 return err 702 } 703 } 704 } 705 706 // UnlinkAt deletes the non-directory file at the given path. 707 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 708 if !pop.Path.Begin.Ok() { 709 // pop.Path should not be empty in operations that create/delete files. 710 // This is consistent with unlinkat(dirfd, "", 0). 711 if pop.Path.Absolute { 712 return linuxerr.EBUSY 713 } 714 return linuxerr.ENOENT 715 } 716 if pop.FollowFinalSymlink { 717 ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") 718 return linuxerr.EINVAL 719 } 720 721 rp := vfs.getResolvingPath(creds, pop) 722 for { 723 vfs.maybeBlockOnMountPromise(ctx, rp) 724 err := rp.mount.fs.impl.UnlinkAt(ctx, rp) 725 if err == nil { 726 rp.Release(ctx) 727 return nil 728 } 729 if checkInvariants { 730 if rp.canHandleError(err) && rp.Done() { 731 panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 732 } 733 } 734 if !rp.handleError(ctx, err) { 735 rp.Release(ctx) 736 return err 737 } 738 } 739 } 740 741 // BoundEndpointAt gets the bound endpoint at the given path, if one exists. 
742 func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { 743 rp := vfs.getResolvingPath(creds, pop) 744 for { 745 vfs.maybeBlockOnMountPromise(ctx, rp) 746 bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) 747 if err == nil { 748 rp.Release(ctx) 749 return bep, nil 750 } 751 if checkInvariants { 752 if rp.canHandleError(err) && rp.Done() { 753 panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 754 } 755 } 756 if !rp.handleError(ctx, err) { 757 rp.Release(ctx) 758 return nil, err 759 } 760 } 761 } 762 763 // ListXattrAt returns all extended attribute names for the file at the given 764 // path. 765 func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { 766 rp := vfs.getResolvingPath(creds, pop) 767 for { 768 vfs.maybeBlockOnMountPromise(ctx, rp) 769 names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) 770 if err == nil { 771 rp.Release(ctx) 772 return names, nil 773 } 774 if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { 775 // Linux doesn't actually return EOPNOTSUPP in this case; instead, 776 // fs/xattr.c:vfs_listxattr() falls back to allowing the security 777 // subsystem to return security extended attributes, which by 778 // default don't exist. 779 rp.Release(ctx) 780 return nil, nil 781 } 782 if !rp.handleError(ctx, err) { 783 rp.Release(ctx) 784 return nil, err 785 } 786 } 787 } 788 789 // GetXattrAt returns the value associated with the given extended attribute 790 // for the file at the given path. 
791 func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { 792 rp := vfs.getResolvingPath(creds, pop) 793 for { 794 vfs.maybeBlockOnMountPromise(ctx, rp) 795 val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) 796 if err == nil { 797 rp.Release(ctx) 798 return val, nil 799 } 800 if !rp.handleError(ctx, err) { 801 rp.Release(ctx) 802 return "", err 803 } 804 } 805 } 806 807 // SetXattrAt changes the value associated with the given extended attribute 808 // for the file at the given path. 809 func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { 810 rp := vfs.getResolvingPath(creds, pop) 811 for { 812 vfs.maybeBlockOnMountPromise(ctx, rp) 813 err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) 814 if err == nil { 815 rp.Release(ctx) 816 return nil 817 } 818 if !rp.handleError(ctx, err) { 819 rp.Release(ctx) 820 return err 821 } 822 } 823 } 824 825 // RemoveXattrAt removes the given extended attribute from the file at rp. 826 func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { 827 rp := vfs.getResolvingPath(creds, pop) 828 for { 829 vfs.maybeBlockOnMountPromise(ctx, rp) 830 err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) 831 if err == nil { 832 rp.Release(ctx) 833 return nil 834 } 835 if !rp.handleError(ctx, err) { 836 rp.Release(ctx) 837 return err 838 } 839 } 840 } 841 842 // SyncAllFilesystems has the semantics of Linux's sync(2). 
843 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { 844 var retErr error 845 for fs := range vfs.getFilesystems() { 846 if err := fs.impl.Sync(ctx); err != nil && retErr == nil { 847 retErr = err 848 } 849 fs.DecRef(ctx) 850 } 851 return retErr 852 } 853 854 func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} { 855 fss := make(map[*Filesystem]struct{}) 856 vfs.filesystemsMu.Lock() 857 defer vfs.filesystemsMu.Unlock() 858 for fs := range vfs.filesystems { 859 if !fs.TryIncRef() { 860 continue 861 } 862 fss[fs] = struct{}{} 863 } 864 return fss 865 } 866 867 // MkdirAllAt recursively creates non-existent directories on the given path 868 // (including the last component). 869 func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions, mustBeDir bool) error { 870 pop := &PathOperation{ 871 Root: root, 872 Start: root, 873 Path: fspath.Parse(currentPath), 874 } 875 stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) 876 switch { 877 case err == nil: 878 if mustBeDir && (stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory) { 879 return linuxerr.ENOTDIR 880 } 881 // Directory already exists. 882 return nil 883 case linuxerr.Equals(linuxerr.ENOENT, err): 884 // Expected, we will create the dir. 885 default: 886 return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) 887 } 888 889 // Recurse to ensure parent is created and then create the final directory. 
890 if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 891 return err 892 } 893 if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { 894 return fmt.Errorf("failed to create directory %q: %w", currentPath, err) 895 } 896 return nil 897 } 898 899 // MakeSyntheticMountpoint creates parent directories of target if they do not 900 // exist and attempts to create a directory for the mountpoint. If a 901 // non-directory file already exists there then we allow it. 902 func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { 903 mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} 904 905 // Make sure the parent directory of target exists. 906 if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 907 return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) 908 } 909 910 // Attempt to mkdir the final component. If a file (of any type) exists 911 // then we let allow mounting on top of that because we do not require the 912 // target to be an existing directory, unlike Linux mount(2). 913 if err := vfs.MkdirAllAt(ctx, target, root, creds, mkdirOpts, false /* mustBeDir */); err != nil { 914 return fmt.Errorf("failed to create mountpoint %q: %w", target, err) 915 } 916 return nil 917 } 918 919 // RegisterMountPromise marks vd as a mount promise. This means any VFS 920 // operation on vd will be blocked until another process mounts over it or the 921 // mount promise times out. 
922 func (vfs *VirtualFilesystem) RegisterMountPromise(vd VirtualDentry) error { 923 vfs.mountPromisesMu.Lock() 924 defer vfs.mountPromisesMu.Unlock() 925 if _, ok := vfs.mountPromises[vd]; ok { 926 return fmt.Errorf("mount promise for %v already exists", vd) 927 } 928 wq := &waiter.Queue{} 929 vfs.mountPromises[vd] = wq 930 return nil 931 } 932 933 // Emit a SentryMountPromiseBlockEvent and wait for the mount promise to be 934 // resolved or time out. 935 func (vfs *VirtualFilesystem) maybeBlockOnMountPromise(ctx context.Context, rp *ResolvingPath) { 936 vd := VirtualDentry{rp.mount, rp.start} 937 vfs.mountPromisesMu.RLock() 938 wq, ok := vfs.mountPromises[vd] 939 vfs.mountPromisesMu.RUnlock() 940 if !ok { 941 return 942 } 943 944 path, err := vfs.PathnameReachable(ctx, rp.root, vd) 945 if err != nil { 946 panic(fmt.Sprintf("could not reach %v from root", rp.Component())) 947 } 948 e, ch := waiter.NewChannelEntry(waiter.EventOut) 949 wq.EventRegister(&e) 950 eventchannel.Emit(&epb.SentryMountPromiseBlockEvent{Path: path}) 951 952 select { 953 case <-ch: 954 // Update rp to point to the promised mount. 955 newMnt := vfs.getMountAt(ctx, rp.mount, rp.start) 956 rp.mount = newMnt 957 rp.start = newMnt.root 958 rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef 959 case <-time.After(mountPromiseTimeout): 960 panic(fmt.Sprintf("mount promise for %s timed out, unable to proceed", path)) 961 } 962 } 963 964 func (vfs *VirtualFilesystem) maybeResolveMountPromise(vd VirtualDentry) { 965 vfs.mountPromisesMu.Lock() 966 defer vfs.mountPromisesMu.Unlock() 967 wq, ok := vfs.mountPromises[vd] 968 if !ok { 969 return 970 } 971 wq.Notify(waiter.EventOut) 972 delete(vfs.mountPromises, vd) 973 } 974 975 // PopDelayedDecRefs transfers the ownership of vfs.toDecRef to the caller via 976 // the returned list. It is the caller's responsibility to DecRef these object 977 // later. They must be DecRef'd outside of mountMu. 
978 // 979 // +checklocks:vfs.mountMu 980 func (vfs *VirtualFilesystem) PopDelayedDecRefs() []refs.RefCounter { 981 var rcs []refs.RefCounter 982 for rc, refs := range vfs.toDecRef { 983 for i := 0; i < refs; i++ { 984 rcs = append(rcs, rc) 985 } 986 } 987 vfs.toDecRef = map[refs.RefCounter]int{} 988 return rcs 989 } 990 991 // delayDecRef saves a reference counted object so that it can be DecRef'd 992 // outside of vfs.mountMu. This is necessary because filesystem locks possibly 993 // taken by DentryImpl.DecRef() may precede vfs.mountMu in the lock order, and 994 // Mount.DecRef() may lock vfs.mountMu. 995 // 996 // +checklocks:vfs.mountMu 997 func (vfs *VirtualFilesystem) delayDecRef(rc refs.RefCounter) { 998 vfs.toDecRef[rc]++ 999 } 1000 1001 // Use this instead of vfs.mountMu.Lock(). 1002 // 1003 // +checklocksacquire:vfs.mountMu 1004 func (vfs *VirtualFilesystem) lockMounts() { 1005 vfs.mountMu.Lock() 1006 } 1007 1008 // Use this instead of vfs.mountMu.Unlock(). This method DecRefs any reference 1009 // counted objects that were collected while mountMu was held. 1010 // 1011 // +checklocksrelease:vfs.mountMu 1012 func (vfs *VirtualFilesystem) unlockMounts(ctx context.Context) { 1013 if len(vfs.toDecRef) == 0 { 1014 vfs.mountMu.Unlock() 1015 return 1016 } 1017 toDecRef := vfs.toDecRef 1018 vfs.toDecRef = map[refs.RefCounter]int{} 1019 vfs.mountMu.Unlock() 1020 for rc, refs := range toDecRef { 1021 for i := 0; i < refs; i++ { 1022 rc.DecRef(ctx) 1023 } 1024 } 1025 } 1026 1027 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry 1028 // (which represents a node in a Filesystem's tree) and a Mount (which 1029 // represents the Filesystem's position in a VFS mount tree). 1030 // 1031 // VirtualDentry's semantics are similar to that of a Go interface object 1032 // representing a pointer: it is a copyable value type that represents 1033 // references to another entity. 
The zero value of VirtualDentry is an "empty 1034 // VirtualDentry", directly analogous to a nil interface object. 1035 // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless 1036 // otherwise specified, all other VirtualDentry methods require 1037 // VirtualDentry.Ok() == true. 1038 // 1039 // Mounts and Dentries are reference-counted, requiring that users call 1040 // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to 1041 // references on the Mount and Dentry referred to by a VirtualDentry as 1042 // references on the VirtualDentry itself. Unless otherwise specified, all 1043 // VirtualDentry methods require that a reference is held on the VirtualDentry. 1044 // 1045 // VirtualDentry is analogous to Linux's struct path. 1046 // 1047 // +stateify savable 1048 type VirtualDentry struct { 1049 mount *Mount 1050 dentry *Dentry 1051 } 1052 1053 // MakeVirtualDentry creates a VirtualDentry. 1054 func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { 1055 return VirtualDentry{ 1056 mount: mount, 1057 dentry: dentry, 1058 } 1059 } 1060 1061 // Ok returns true if vd is not empty. It does not require that a reference is 1062 // held. 1063 func (vd VirtualDentry) Ok() bool { 1064 return vd.mount != nil 1065 } 1066 1067 // IncRef increments the reference counts on the Mount and Dentry represented 1068 // by vd. 1069 func (vd VirtualDentry) IncRef() { 1070 vd.mount.IncRef() 1071 vd.dentry.IncRef() 1072 } 1073 1074 // DecRef decrements the reference counts on the Mount and Dentry represented 1075 // by vd. 1076 func (vd VirtualDentry) DecRef(ctx context.Context) { 1077 vd.dentry.DecRef(ctx) 1078 vd.mount.DecRef(ctx) 1079 } 1080 1081 // Mount returns the Mount associated with vd. It does not take a reference on 1082 // the returned Mount. 1083 func (vd VirtualDentry) Mount() *Mount { 1084 return vd.mount 1085 } 1086 1087 // Dentry returns the Dentry associated with vd. 
It does not take a reference 1088 // on the returned Dentry. 1089 func (vd VirtualDentry) Dentry() *Dentry { 1090 return vd.dentry 1091 }