github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/vfs/vfs.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package vfs implements a virtual filesystem layer. 16 // 17 // Lock order: 18 // 19 // EpollInstance.interestMu 20 // FileDescription.epollMu 21 // Locks acquired by FilesystemImpl/FileDescriptionImpl methods (except IsDescendant) 22 // VirtualFilesystem.mountMu 23 // Dentry.mu 24 // Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry 25 // VirtualFilesystem.filesystemsMu 26 // fdnotifier.notifier.mu 27 // EpollInstance.readyMu 28 // Inotify.mu 29 // Watches.mu 30 // Inotify.evMu 31 // VirtualFilesystem.fsTypesMu 32 // 33 // Locking Dentry.mu in multiple Dentries requires holding 34 // VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple 35 // EpollInstances requires holding epollCycleMu. 36 // 37 // FilesystemImpl locks are not held during calls to FilesystemImpl.IsDescendant 38 // since it's called under mountMu. It's possible for concurrent mutation 39 // to dentry ancestors during calls to IsDescendant. Callers should take 40 // appropriate caution when using this method. 
41 package vfs 42 43 import ( 44 "fmt" 45 "path" 46 "time" 47 48 "github.com/metacubex/gvisor/pkg/abi/linux" 49 "github.com/metacubex/gvisor/pkg/atomicbitops" 50 "github.com/metacubex/gvisor/pkg/bitmap" 51 "github.com/metacubex/gvisor/pkg/context" 52 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 53 "github.com/metacubex/gvisor/pkg/eventchannel" 54 "github.com/metacubex/gvisor/pkg/fspath" 55 "github.com/metacubex/gvisor/pkg/refs" 56 "github.com/metacubex/gvisor/pkg/sentry/fsmetric" 57 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 58 "github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport" 59 epb "github.com/metacubex/gvisor/pkg/sentry/vfs/events_go_proto" 60 "github.com/metacubex/gvisor/pkg/sync" 61 "github.com/metacubex/gvisor/pkg/waiter" 62 ) 63 64 // How long to wait for a mount promise before proceeding with the VFS 65 // operation. This should be configurable by the user eventually. 66 const mountPromiseTimeout = 30 * time.Second 67 68 // A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts. 69 // 70 // There is no analogue to the VirtualFilesystem type in Linux, as the 71 // equivalent state in Linux is global. 72 // 73 // +stateify savable 74 type VirtualFilesystem struct { 75 // mountMu serializes mount mutations. 76 // 77 // mountMu is analogous to Linux's namespace_sem. 78 mountMu virtualFilesystemMutex `state:"nosave"` 79 80 // mounts maps (mount parent, mount point) pairs to mounts. (Since mounts 81 // are uniquely namespaced, including mount parent in the key correctly 82 // handles both bind mounts and mount namespaces; Linux does the same.) 83 // Synchronization between mutators and readers is provided by mounts.seq; 84 // synchronization between mutators is provided by mountMu. 85 // 86 // mounts is used to follow mount points during path traversal. 
We use a 87 // single table rather than per-Dentry tables to reduce size (and therefore 88 // cache footprint) for the vast majority of Dentries that are not mount 89 // points. 90 // 91 // mounts is analogous to Linux's mount_hashtable. 92 mounts mountTable `state:".([]*Mount)"` 93 94 // mountpoints maps mount points to mounts at those points in all 95 // namespaces. mountpoints is protected by mountMu. 96 // 97 // mountpoints is used to find mounts that must be umounted due to 98 // removal of a mount point Dentry from another mount namespace. ("A file 99 // or directory that is a mount point in one namespace that is not a mount 100 // point in another namespace, may be renamed, unlinked, or removed 101 // (rmdir(2)) in the mount namespace in which it is not a mount point 102 // (subject to the usual permission checks)." - mount_namespaces(7)) 103 // 104 // mountpoints is analogous to Linux's mountpoint_hashtable. 105 mountpoints map[*Dentry]map[*Mount]struct{} 106 107 // lastMountID is the last allocated mount ID. lastMountID is accessed 108 // using atomic memory operations. 109 lastMountID atomicbitops.Uint64 110 111 // anonMount is a Mount, not included in mounts or mountpoints, 112 // representing an anonFilesystem. anonMount is used to back 113 // VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry(). 114 // anonMount is immutable. 115 // 116 // anonMount is analogous to Linux's anon_inode_mnt. 117 anonMount *Mount 118 119 // devices contains all registered Devices. devices is protected by 120 // devicesMu. 121 devicesMu sync.RWMutex `state:"nosave"` 122 devices map[devTuple]*registeredDevice 123 124 // dynCharDevMajorUsed contains all allocated dynamic character device 125 // major numbers. dynCharDevMajor is protected by dynCharDevMajorMu. 126 dynCharDevMajorMu sync.Mutex `state:"nosave"` 127 dynCharDevMajorUsed map[uint32]struct{} 128 129 // anonBlockDevMinor contains all allocated anonymous block device minor 130 // numbers. 
anonBlockDevMinorNext is a lower bound for the smallest 131 // unallocated anonymous block device number. anonBlockDevMinorNext and 132 // anonBlockDevMinor are protected by anonBlockDevMinorMu. 133 anonBlockDevMinorMu sync.Mutex `state:"nosave"` 134 anonBlockDevMinorNext uint32 135 anonBlockDevMinor map[uint32]struct{} 136 137 // fsTypes contains all registered FilesystemTypes. fsTypes is protected by 138 // fsTypesMu. 139 fsTypesMu sync.RWMutex `state:"nosave"` 140 fsTypes map[string]*registeredFilesystemType 141 142 // filesystems contains all Filesystems. filesystems is protected by 143 // filesystemsMu. 144 filesystemsMu sync.Mutex `state:"nosave"` 145 filesystems map[*Filesystem]struct{} 146 147 // groupIDBitmap tracks which mount group IDs are available for allocation. 148 groupIDBitmap bitmap.Bitmap 149 150 // mountPromises contains all unresolved mount promises. 151 mountPromisesMu sync.RWMutex `state:"nosave"` 152 mountPromises map[VirtualDentry]*waiter.Queue 153 154 // toDecRef contains all the reference counted objects that needed to be 155 // DecRefd while mountMu was held. It is cleared every time unlockMounts is 156 // called and protected by mountMu. 157 // 158 // +checklocks:mountMu 159 toDecRef map[refs.RefCounter]int 160 } 161 162 // Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes. 
163 func (vfs *VirtualFilesystem) Init(ctx context.Context) error { 164 if vfs.mountpoints != nil { 165 panic("VFS already initialized") 166 } 167 vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{}) 168 vfs.devices = make(map[devTuple]*registeredDevice) 169 vfs.dynCharDevMajorUsed = make(map[uint32]struct{}) 170 vfs.anonBlockDevMinorNext = 1 171 vfs.anonBlockDevMinor = make(map[uint32]struct{}) 172 vfs.fsTypes = make(map[string]*registeredFilesystemType) 173 vfs.filesystems = make(map[*Filesystem]struct{}) 174 vfs.mounts.Init() 175 vfs.groupIDBitmap = bitmap.New(1024) 176 vfs.mountPromises = make(map[VirtualDentry]*waiter.Queue) 177 vfs.mountMu.Lock() 178 vfs.toDecRef = make(map[refs.RefCounter]int) 179 vfs.mountMu.Unlock() 180 181 // Construct vfs.anonMount. 182 anonfsDevMinor, err := vfs.GetAnonBlockDevMinor() 183 if err != nil { 184 // This shouldn't be possible since anonBlockDevMinorNext was 185 // initialized to 1 above (no device numbers have been allocated yet). 186 panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err)) 187 } 188 anonfs := anonFilesystem{ 189 devMinor: anonfsDevMinor, 190 } 191 anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs) 192 defer anonfs.vfsfs.DecRef(ctx) 193 anonMount := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{}) 194 vfs.anonMount = anonMount 195 196 return nil 197 } 198 199 // Release drops references on filesystem objects held by vfs. 200 // 201 // Precondition: This must be called after VFS.Init() has succeeded. 202 func (vfs *VirtualFilesystem) Release(ctx context.Context) { 203 vfs.anonMount.DecRef(ctx) 204 for _, fst := range vfs.fsTypes { 205 fst.fsType.Release(ctx) 206 } 207 } 208 209 // PathOperation specifies the path operated on by a VFS method. 210 // 211 // PathOperation is passed to VFS methods by pointer to reduce memory copying: 212 // it's somewhat large and should never escape. 
(Options structs are passed by 213 // pointer to VFS and FileDescription methods for the same reason.) 214 // 215 // +stateify savable 216 type PathOperation struct { 217 // Root is the VFS root. References on Root are borrowed from the provider 218 // of the PathOperation. 219 // 220 // Invariants: Root.Ok(). 221 Root VirtualDentry 222 223 // Start is the starting point for the path traversal. References on Start 224 // are borrowed from the provider of the PathOperation (i.e. the caller of 225 // the VFS method to which the PathOperation was passed). 226 // 227 // Invariants: Start.Ok(). If Path.Absolute, then Start == Root. 228 Start VirtualDentry 229 230 // Path is the pathname traversed by this operation. 231 Path fspath.Path 232 233 // If FollowFinalSymlink is true, and the Dentry traversed by the final 234 // path component represents a symbolic link, the symbolic link should be 235 // followed. 236 FollowFinalSymlink bool 237 } 238 239 // AccessAt checks whether a user with creds has access to the file at 240 // the given path. 241 func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { 242 rp := vfs.getResolvingPath(creds, pop) 243 for { 244 vfs.maybeBlockOnMountPromise(ctx, rp) 245 err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) 246 if err == nil { 247 rp.Release(ctx) 248 return nil 249 } 250 if !rp.handleError(ctx, err) { 251 rp.Release(ctx) 252 return err 253 } 254 } 255 } 256 257 // GetDentryAt returns a VirtualDentry representing the given path, at which a 258 // file must exist. A reference is taken on the returned VirtualDentry. 
259 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { 260 rp := vfs.getResolvingPath(creds, pop) 261 for { 262 vfs.maybeBlockOnMountPromise(ctx, rp) 263 d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) 264 if err == nil { 265 vd := VirtualDentry{ 266 mount: rp.mount, 267 dentry: d, 268 } 269 rp.mount.IncRef() 270 rp.Release(ctx) 271 return vd, nil 272 } 273 if !rp.handleError(ctx, err) { 274 rp.Release(ctx) 275 return VirtualDentry{}, err 276 } 277 } 278 } 279 280 // Preconditions: pop.Path.Begin.Ok(). 281 func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { 282 rp := vfs.getResolvingPath(creds, pop) 283 for { 284 vfs.maybeBlockOnMountPromise(ctx, rp) 285 parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) 286 if err == nil { 287 parentVD := VirtualDentry{ 288 mount: rp.mount, 289 dentry: parent, 290 } 291 rp.mount.IncRef() 292 name := rp.Component() 293 rp.Release(ctx) 294 return parentVD, name, nil 295 } 296 if checkInvariants { 297 if rp.canHandleError(err) && rp.Done() { 298 panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 299 } 300 } 301 if !rp.handleError(ctx, err) { 302 rp.Release(ctx) 303 return VirtualDentry{}, "", err 304 } 305 } 306 } 307 308 // LinkAt creates a hard link at newpop representing the existing file at 309 // oldpop. 
310 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { 311 oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) 312 if err != nil { 313 return err 314 } 315 316 if !newpop.Path.Begin.Ok() { 317 oldVD.DecRef(ctx) 318 if newpop.Path.Absolute { 319 return linuxerr.EEXIST 320 } 321 return linuxerr.ENOENT 322 } 323 if newpop.FollowFinalSymlink { 324 oldVD.DecRef(ctx) 325 ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") 326 return linuxerr.EINVAL 327 } 328 329 rp := vfs.getResolvingPath(creds, newpop) 330 for { 331 vfs.maybeBlockOnMountPromise(ctx, rp) 332 err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) 333 if err == nil { 334 rp.Release(ctx) 335 oldVD.DecRef(ctx) 336 return nil 337 } 338 if checkInvariants { 339 if rp.canHandleError(err) && rp.Done() { 340 panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 341 } 342 } 343 if !rp.handleError(ctx, err) { 344 rp.Release(ctx) 345 oldVD.DecRef(ctx) 346 return err 347 } 348 } 349 } 350 351 // MkdirAt creates a directory at the given path. 352 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { 353 if !pop.Path.Begin.Ok() { 354 // pop.Path should not be empty in operations that create/delete files. 355 // This is consistent with mkdirat(dirfd, "", mode). 356 if pop.Path.Absolute { 357 return linuxerr.EEXIST 358 } 359 return linuxerr.ENOENT 360 } 361 if pop.FollowFinalSymlink { 362 ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") 363 return linuxerr.EINVAL 364 } 365 // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is 366 // also honored." 
- mkdir(2) 367 opts.Mode &= 0777 | linux.S_ISVTX 368 369 rp := vfs.getResolvingPath(creds, pop) 370 for { 371 vfs.maybeBlockOnMountPromise(ctx, rp) 372 err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) 373 if err == nil { 374 rp.Release(ctx) 375 return nil 376 } 377 if checkInvariants { 378 if rp.canHandleError(err) && rp.Done() { 379 panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 380 } 381 } 382 if !rp.handleError(ctx, err) { 383 rp.Release(ctx) 384 return err 385 } 386 } 387 } 388 389 // MknodAt creates a file of the given mode at the given path. It returns an 390 // error from the linuxerr package. 391 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { 392 if !pop.Path.Begin.Ok() { 393 // pop.Path should not be empty in operations that create/delete files. 394 // This is consistent with mknodat(dirfd, "", mode, dev). 395 if pop.Path.Absolute { 396 return linuxerr.EEXIST 397 } 398 return linuxerr.ENOENT 399 } 400 if pop.FollowFinalSymlink { 401 ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") 402 return linuxerr.EINVAL 403 } 404 405 rp := vfs.getResolvingPath(creds, pop) 406 for { 407 vfs.maybeBlockOnMountPromise(ctx, rp) 408 err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) 409 if err == nil { 410 rp.Release(ctx) 411 return nil 412 } 413 if checkInvariants { 414 if rp.canHandleError(err) && rp.Done() { 415 panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 416 } 417 } 418 if !rp.handleError(ctx, err) { 419 rp.Release(ctx) 420 return err 421 } 422 } 423 } 424 425 // OpenAt returns a FileDescription providing access to the file at the given 426 // path. A reference is taken on the returned FileDescription. 
427 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { 428 fsmetric.Opens.Increment() 429 430 // Remove: 431 // 432 // - O_CLOEXEC, which affects file descriptors and therefore must be 433 // handled outside of VFS. 434 // 435 // - Unknown flags. 436 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE 437 // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. 438 if opts.Flags&linux.O_SYNC != 0 { 439 opts.Flags |= linux.O_DSYNC 440 } 441 // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified 442 // with O_DIRECTORY and a writable access mode (to ensure that it fails on 443 // filesystem implementations that do not support it). 444 if opts.Flags&linux.O_TMPFILE != 0 { 445 if opts.Flags&linux.O_DIRECTORY == 0 { 446 return nil, linuxerr.EINVAL 447 } 448 if opts.Flags&linux.O_CREAT != 0 { 449 return nil, linuxerr.EINVAL 450 } 451 if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { 452 return nil, linuxerr.EINVAL 453 } 454 } 455 // O_PATH causes most other flags to be ignored. 
456 if opts.Flags&linux.O_PATH != 0 { 457 opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH 458 } 459 // "On Linux, the following bits are also honored in mode: [S_ISUID, 460 // S_ISGID, S_ISVTX]" - open(2) 461 opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX 462 463 if opts.Flags&linux.O_NOFOLLOW != 0 { 464 pop.FollowFinalSymlink = false 465 } 466 if opts.Flags&linux.O_PATH != 0 { 467 return vfs.openOPathFD(ctx, creds, pop, opts.Flags) 468 } 469 rp := vfs.getResolvingPath(creds, pop) 470 if opts.Flags&linux.O_DIRECTORY != 0 { 471 rp.mustBeDir = true 472 } 473 for { 474 vfs.maybeBlockOnMountPromise(ctx, rp) 475 fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) 476 if err == nil { 477 rp.Release(ctx) 478 479 if opts.FileExec { 480 if fd.Mount().Options().Flags.NoExec { 481 fd.DecRef(ctx) 482 return nil, linuxerr.EACCES 483 } 484 485 // Only a regular file can be executed. 486 stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) 487 if err != nil { 488 fd.DecRef(ctx) 489 return nil, err 490 } 491 if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { 492 fd.DecRef(ctx) 493 return nil, linuxerr.EACCES 494 } 495 } 496 497 fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) 498 return fd, nil 499 } 500 if !rp.handleError(ctx, err) { 501 rp.Release(ctx) 502 return nil, err 503 } 504 } 505 } 506 507 // ReadlinkAt returns the target of the symbolic link at the given path. 508 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { 509 rp := vfs.getResolvingPath(creds, pop) 510 for { 511 vfs.maybeBlockOnMountPromise(ctx, rp) 512 target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) 513 if err == nil { 514 rp.Release(ctx) 515 return target, nil 516 } 517 if !rp.handleError(ctx, err) { 518 rp.Release(ctx) 519 return "", err 520 } 521 } 522 } 523 524 // RenameAt renames the file at oldpop to newpop. 
525 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { 526 if !oldpop.Path.Begin.Ok() { 527 if oldpop.Path.Absolute { 528 return linuxerr.EBUSY 529 } 530 return linuxerr.ENOENT 531 } 532 if oldpop.FollowFinalSymlink { 533 ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") 534 return linuxerr.EINVAL 535 } 536 537 oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) 538 if err != nil { 539 return err 540 } 541 if oldName == "." || oldName == ".." { 542 oldParentVD.DecRef(ctx) 543 return linuxerr.EBUSY 544 } 545 if len(oldName) > linux.NAME_MAX { 546 oldParentVD.DecRef(ctx) 547 return linuxerr.ENAMETOOLONG 548 } 549 550 if !newpop.Path.Begin.Ok() { 551 oldParentVD.DecRef(ctx) 552 if newpop.Path.Absolute { 553 return linuxerr.EBUSY 554 } 555 return linuxerr.ENOENT 556 } 557 if newpop.FollowFinalSymlink { 558 oldParentVD.DecRef(ctx) 559 ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") 560 return linuxerr.EINVAL 561 } 562 563 rp := vfs.getResolvingPath(creds, newpop) 564 renameOpts := *opts 565 if oldpop.Path.Dir { 566 renameOpts.MustBeDir = true 567 } 568 for { 569 vfs.maybeBlockOnMountPromise(ctx, rp) 570 err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) 571 if err == nil { 572 rp.Release(ctx) 573 oldParentVD.DecRef(ctx) 574 return nil 575 } 576 if checkInvariants { 577 if rp.canHandleError(err) && rp.Done() { 578 panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 579 } 580 } 581 if !rp.handleError(ctx, err) { 582 rp.Release(ctx) 583 oldParentVD.DecRef(ctx) 584 return err 585 } 586 } 587 } 588 589 // RmdirAt removes the directory at the given path. 
590 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 591 if !pop.Path.Begin.Ok() { 592 // pop.Path should not be empty in operations that create/delete files. 593 // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). 594 if pop.Path.Absolute { 595 return linuxerr.EBUSY 596 } 597 return linuxerr.ENOENT 598 } 599 if pop.FollowFinalSymlink { 600 ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") 601 return linuxerr.EINVAL 602 } 603 604 rp := vfs.getResolvingPath(creds, pop) 605 for { 606 vfs.maybeBlockOnMountPromise(ctx, rp) 607 err := rp.mount.fs.impl.RmdirAt(ctx, rp) 608 if err == nil { 609 rp.Release(ctx) 610 return nil 611 } 612 if checkInvariants { 613 if rp.canHandleError(err) && rp.Done() { 614 panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 615 } 616 } 617 if !rp.handleError(ctx, err) { 618 rp.Release(ctx) 619 return err 620 } 621 } 622 } 623 624 // SetStatAt changes metadata for the file at the given path. 625 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { 626 rp := vfs.getResolvingPath(creds, pop) 627 for { 628 vfs.maybeBlockOnMountPromise(ctx, rp) 629 err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) 630 if err == nil { 631 rp.Release(ctx) 632 return nil 633 } 634 if !rp.handleError(ctx, err) { 635 rp.Release(ctx) 636 return err 637 } 638 } 639 } 640 641 // StatAt returns metadata for the file at the given path. 
642 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { 643 rp := vfs.getResolvingPath(creds, pop) 644 for { 645 vfs.maybeBlockOnMountPromise(ctx, rp) 646 stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) 647 if err == nil { 648 rp.Release(ctx) 649 return stat, nil 650 } 651 if !rp.handleError(ctx, err) { 652 rp.Release(ctx) 653 return linux.Statx{}, err 654 } 655 } 656 } 657 658 // StatFSAt returns metadata for the filesystem containing the file at the 659 // given path. 660 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { 661 rp := vfs.getResolvingPath(creds, pop) 662 for { 663 vfs.maybeBlockOnMountPromise(ctx, rp) 664 statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) 665 if err == nil { 666 statfs.Flags |= rp.mount.MountFlags() 667 rp.Release(ctx) 668 return statfs, nil 669 } 670 if !rp.handleError(ctx, err) { 671 rp.Release(ctx) 672 return linux.Statfs{}, err 673 } 674 } 675 } 676 677 // SymlinkAt creates a symbolic link at the given path with the given target. 678 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { 679 if !pop.Path.Begin.Ok() { 680 // pop.Path should not be empty in operations that create/delete files. 681 // This is consistent with symlinkat(oldpath, newdirfd, ""). 
682 if pop.Path.Absolute { 683 return linuxerr.EEXIST 684 } 685 return linuxerr.ENOENT 686 } 687 if pop.FollowFinalSymlink { 688 ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") 689 return linuxerr.EINVAL 690 } 691 692 rp := vfs.getResolvingPath(creds, pop) 693 for { 694 vfs.maybeBlockOnMountPromise(ctx, rp) 695 err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) 696 if err == nil { 697 rp.Release(ctx) 698 return nil 699 } 700 if checkInvariants { 701 if rp.canHandleError(err) && rp.Done() { 702 panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 703 } 704 } 705 if !rp.handleError(ctx, err) { 706 rp.Release(ctx) 707 return err 708 } 709 } 710 } 711 712 // UnlinkAt deletes the non-directory file at the given path. 713 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 714 if !pop.Path.Begin.Ok() { 715 // pop.Path should not be empty in operations that create/delete files. 716 // This is consistent with unlinkat(dirfd, "", 0). 717 if pop.Path.Absolute { 718 return linuxerr.EBUSY 719 } 720 return linuxerr.ENOENT 721 } 722 if pop.FollowFinalSymlink { 723 ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") 724 return linuxerr.EINVAL 725 } 726 727 rp := vfs.getResolvingPath(creds, pop) 728 for { 729 vfs.maybeBlockOnMountPromise(ctx, rp) 730 err := rp.mount.fs.impl.UnlinkAt(ctx, rp) 731 if err == nil { 732 rp.Release(ctx) 733 return nil 734 } 735 if checkInvariants { 736 if rp.canHandleError(err) && rp.Done() { 737 panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 738 } 739 } 740 if !rp.handleError(ctx, err) { 741 rp.Release(ctx) 742 return err 743 } 744 } 745 } 746 747 // BoundEndpointAt gets the bound endpoint at the given path, if one exists. 
748 func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { 749 rp := vfs.getResolvingPath(creds, pop) 750 for { 751 vfs.maybeBlockOnMountPromise(ctx, rp) 752 bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) 753 if err == nil { 754 rp.Release(ctx) 755 return bep, nil 756 } 757 if checkInvariants { 758 if rp.canHandleError(err) && rp.Done() { 759 panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 760 } 761 } 762 if !rp.handleError(ctx, err) { 763 rp.Release(ctx) 764 return nil, err 765 } 766 } 767 } 768 769 // ListXattrAt returns all extended attribute names for the file at the given 770 // path. 771 func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { 772 rp := vfs.getResolvingPath(creds, pop) 773 for { 774 vfs.maybeBlockOnMountPromise(ctx, rp) 775 names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) 776 if err == nil { 777 rp.Release(ctx) 778 return names, nil 779 } 780 if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { 781 // Linux doesn't actually return EOPNOTSUPP in this case; instead, 782 // fs/xattr.c:vfs_listxattr() falls back to allowing the security 783 // subsystem to return security extended attributes, which by 784 // default don't exist. 785 rp.Release(ctx) 786 return nil, nil 787 } 788 if !rp.handleError(ctx, err) { 789 rp.Release(ctx) 790 return nil, err 791 } 792 } 793 } 794 795 // GetXattrAt returns the value associated with the given extended attribute 796 // for the file at the given path. 
797 func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { 798 rp := vfs.getResolvingPath(creds, pop) 799 for { 800 vfs.maybeBlockOnMountPromise(ctx, rp) 801 val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) 802 if err == nil { 803 rp.Release(ctx) 804 return val, nil 805 } 806 if !rp.handleError(ctx, err) { 807 rp.Release(ctx) 808 return "", err 809 } 810 } 811 } 812 813 // SetXattrAt changes the value associated with the given extended attribute 814 // for the file at the given path. 815 func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { 816 rp := vfs.getResolvingPath(creds, pop) 817 for { 818 vfs.maybeBlockOnMountPromise(ctx, rp) 819 err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) 820 if err == nil { 821 rp.Release(ctx) 822 return nil 823 } 824 if !rp.handleError(ctx, err) { 825 rp.Release(ctx) 826 return err 827 } 828 } 829 } 830 831 // RemoveXattrAt removes the given extended attribute from the file at rp. 832 func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { 833 rp := vfs.getResolvingPath(creds, pop) 834 for { 835 vfs.maybeBlockOnMountPromise(ctx, rp) 836 err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) 837 if err == nil { 838 rp.Release(ctx) 839 return nil 840 } 841 if !rp.handleError(ctx, err) { 842 rp.Release(ctx) 843 return err 844 } 845 } 846 } 847 848 // SyncAllFilesystems has the semantics of Linux's sync(2). 
849 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { 850 var retErr error 851 for fs := range vfs.getFilesystems() { 852 if err := fs.impl.Sync(ctx); err != nil && retErr == nil { 853 retErr = err 854 } 855 fs.DecRef(ctx) 856 } 857 return retErr 858 } 859 860 func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} { 861 fss := make(map[*Filesystem]struct{}) 862 vfs.filesystemsMu.Lock() 863 defer vfs.filesystemsMu.Unlock() 864 for fs := range vfs.filesystems { 865 if !fs.TryIncRef() { 866 continue 867 } 868 fss[fs] = struct{}{} 869 } 870 return fss 871 } 872 873 // MkdirAllAt recursively creates non-existent directories on the given path 874 // (including the last component). 875 func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions, mustBeDir bool) error { 876 pop := &PathOperation{ 877 Root: root, 878 Start: root, 879 Path: fspath.Parse(currentPath), 880 } 881 stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) 882 switch { 883 case err == nil: 884 if mustBeDir && (stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory) { 885 return linuxerr.ENOTDIR 886 } 887 // Directory already exists. 888 return nil 889 case linuxerr.Equals(linuxerr.ENOENT, err): 890 // Expected, we will create the dir. 891 default: 892 return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) 893 } 894 895 // Recurse to ensure parent is created and then create the final directory. 
896 if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 897 return err 898 } 899 if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { 900 return fmt.Errorf("failed to create directory %q: %w", currentPath, err) 901 } 902 return nil 903 } 904 905 // MakeSyntheticMountpoint creates parent directories of target if they do not 906 // exist and attempts to create a directory for the mountpoint. If a 907 // non-directory file already exists there then we allow it. 908 func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { 909 mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} 910 911 // Make sure the parent directory of target exists. 912 if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 913 return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) 914 } 915 916 // Attempt to mkdir the final component. If a file (of any type) exists 917 // then we let allow mounting on top of that because we do not require the 918 // target to be an existing directory, unlike Linux mount(2). 919 if err := vfs.MkdirAllAt(ctx, target, root, creds, mkdirOpts, false /* mustBeDir */); err != nil { 920 return fmt.Errorf("failed to create mountpoint %q: %w", target, err) 921 } 922 return nil 923 } 924 925 // RegisterMountPromise marks vd as a mount promise. This means any VFS 926 // operation on vd will be blocked until another process mounts over it or the 927 // mount promise times out. 
928 func (vfs *VirtualFilesystem) RegisterMountPromise(vd VirtualDentry) error { 929 vfs.mountPromisesMu.Lock() 930 defer vfs.mountPromisesMu.Unlock() 931 if _, ok := vfs.mountPromises[vd]; ok { 932 return fmt.Errorf("mount promise for %v already exists", vd) 933 } 934 wq := &waiter.Queue{} 935 vfs.mountPromises[vd] = wq 936 return nil 937 } 938 939 // Emit a SentryMountPromiseBlockEvent and wait for the mount promise to be 940 // resolved or time out. 941 func (vfs *VirtualFilesystem) maybeBlockOnMountPromise(ctx context.Context, rp *ResolvingPath) { 942 vd := VirtualDentry{rp.mount, rp.start} 943 vfs.mountPromisesMu.RLock() 944 wq, ok := vfs.mountPromises[vd] 945 vfs.mountPromisesMu.RUnlock() 946 if !ok { 947 return 948 } 949 950 root := RootFromContext(ctx) 951 defer root.DecRef(ctx) 952 path, err := vfs.PathnameReachable(ctx, root, vd) 953 if err != nil { 954 panic(fmt.Sprintf("could not reach %v from root", rp.Component())) 955 } 956 e, ch := waiter.NewChannelEntry(waiter.EventOut) 957 wq.EventRegister(&e) 958 eventchannel.Emit(&epb.SentryMountPromiseBlockEvent{Path: path}) 959 960 select { 961 case <-ch: 962 // Update rp to point to the promised mount. 963 newMnt := vfs.getMountAt(ctx, rp.mount, rp.start) 964 rp.mount = newMnt 965 rp.start = newMnt.root 966 rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef 967 case <-time.After(mountPromiseTimeout): 968 panic(fmt.Sprintf("mount promise for %s timed out, unable to proceed", path)) 969 } 970 } 971 972 func (vfs *VirtualFilesystem) maybeResolveMountPromise(vd VirtualDentry) { 973 vfs.mountPromisesMu.Lock() 974 defer vfs.mountPromisesMu.Unlock() 975 wq, ok := vfs.mountPromises[vd] 976 if !ok { 977 return 978 } 979 wq.Notify(waiter.EventOut) 980 delete(vfs.mountPromises, vd) 981 } 982 983 // PopDelayedDecRefs transfers the ownership of vfs.toDecRef to the caller via 984 // the returned list. It is the caller's responsibility to DecRef these object 985 // later. 
They must be DecRef'd outside of mountMu. 986 // 987 // +checklocks:vfs.mountMu 988 func (vfs *VirtualFilesystem) PopDelayedDecRefs() []refs.RefCounter { 989 var rcs []refs.RefCounter 990 for rc, refs := range vfs.toDecRef { 991 for i := 0; i < refs; i++ { 992 rcs = append(rcs, rc) 993 } 994 } 995 vfs.toDecRef = map[refs.RefCounter]int{} 996 return rcs 997 } 998 999 // delayDecRef saves a reference counted object so that it can be DecRef'd 1000 // outside of vfs.mountMu. This is necessary because filesystem locks possibly 1001 // taken by DentryImpl.DecRef() may precede vfs.mountMu in the lock order, and 1002 // Mount.DecRef() may lock vfs.mountMu. 1003 // 1004 // +checklocks:vfs.mountMu 1005 func (vfs *VirtualFilesystem) delayDecRef(rc refs.RefCounter) { 1006 vfs.toDecRef[rc]++ 1007 } 1008 1009 // Use this instead of vfs.mountMu.Lock(). 1010 // 1011 // +checklocksacquire:vfs.mountMu 1012 func (vfs *VirtualFilesystem) lockMounts() { 1013 vfs.mountMu.Lock() 1014 } 1015 1016 // Use this instead of vfs.mountMu.Unlock(). This method DecRefs any reference 1017 // counted objects that were collected while mountMu was held. 1018 // 1019 // +checklocksrelease:vfs.mountMu 1020 func (vfs *VirtualFilesystem) unlockMounts(ctx context.Context) { 1021 if len(vfs.toDecRef) == 0 { 1022 vfs.mountMu.Unlock() 1023 return 1024 } 1025 toDecRef := vfs.toDecRef 1026 vfs.toDecRef = map[refs.RefCounter]int{} 1027 vfs.mountMu.Unlock() 1028 for rc, refs := range toDecRef { 1029 for i := 0; i < refs; i++ { 1030 rc.DecRef(ctx) 1031 } 1032 } 1033 } 1034 1035 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry 1036 // (which represents a node in a Filesystem's tree) and a Mount (which 1037 // represents the Filesystem's position in a VFS mount tree). 1038 // 1039 // VirtualDentry's semantics are similar to that of a Go interface object 1040 // representing a pointer: it is a copyable value type that represents 1041 // references to another entity. 
The zero value of VirtualDentry is an "empty 1042 // VirtualDentry", directly analogous to a nil interface object. 1043 // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless 1044 // otherwise specified, all other VirtualDentry methods require 1045 // VirtualDentry.Ok() == true. 1046 // 1047 // Mounts and Dentries are reference-counted, requiring that users call 1048 // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to 1049 // references on the Mount and Dentry referred to by a VirtualDentry as 1050 // references on the VirtualDentry itself. Unless otherwise specified, all 1051 // VirtualDentry methods require that a reference is held on the VirtualDentry. 1052 // 1053 // VirtualDentry is analogous to Linux's struct path. 1054 // 1055 // +stateify savable 1056 type VirtualDentry struct { 1057 mount *Mount 1058 dentry *Dentry 1059 } 1060 1061 // MakeVirtualDentry creates a VirtualDentry. 1062 func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { 1063 return VirtualDentry{ 1064 mount: mount, 1065 dentry: dentry, 1066 } 1067 } 1068 1069 // Ok returns true if vd is not empty. It does not require that a reference is 1070 // held. 1071 func (vd VirtualDentry) Ok() bool { 1072 return vd.mount != nil 1073 } 1074 1075 // IncRef increments the reference counts on the Mount and Dentry represented 1076 // by vd. 1077 func (vd VirtualDentry) IncRef() { 1078 vd.mount.IncRef() 1079 vd.dentry.IncRef() 1080 } 1081 1082 // DecRef decrements the reference counts on the Mount and Dentry represented 1083 // by vd. 1084 func (vd VirtualDentry) DecRef(ctx context.Context) { 1085 vd.dentry.DecRef(ctx) 1086 vd.mount.DecRef(ctx) 1087 } 1088 1089 // Mount returns the Mount associated with vd. It does not take a reference on 1090 // the returned Mount. 1091 func (vd VirtualDentry) Mount() *Mount { 1092 return vd.mount 1093 } 1094 1095 // Dentry returns the Dentry associated with vd. 
It does not take a reference 1096 // on the returned Dentry. 1097 func (vd VirtualDentry) Dentry() *Dentry { 1098 return vd.dentry 1099 }