// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package vfs implements a virtual filesystem layer.
//
// Lock order:
//
//	EpollInstance.interestMu
//	FileDescription.epollMu
//	Locks acquired by FilesystemImpl/FileDescriptionImpl methods
//	VirtualFilesystem.mountMu
//	Dentry.mu
//	Locks acquired by FilesystemImpls between Prepare{Delete,Rename}Dentry and Commit{Delete,Rename*}Dentry
//	VirtualFilesystem.filesystemsMu
//	fdnotifier.notifier.mu
//	EpollInstance.readyMu
//	Inotify.mu
//	Watches.mu
//	Inotify.evMu
//	VirtualFilesystem.fsTypesMu
//
// Locking Dentry.mu in multiple Dentries requires holding
// VirtualFilesystem.mountMu. Locking EpollInstance.interestMu in multiple
// EpollInstances requires holding epollCycleMu.
package vfs

import (
	"fmt"
	"path"
	"time"

	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
	"github.com/nicocha30/gvisor-ligolo/pkg/bitmap"
	"github.com/nicocha30/gvisor-ligolo/pkg/context"
	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
	"github.com/nicocha30/gvisor-ligolo/pkg/eventchannel"
	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
	"github.com/nicocha30/gvisor-ligolo/pkg/log"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsmetric"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport"
	epb "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs/events_go_proto"
	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
	"github.com/nicocha30/gvisor-ligolo/pkg/waiter"
)

// mountPromiseTimeout is how long to wait for a mount promise before
// proceeding with the VFS operation. This should be configurable by the user
// eventually.
const mountPromiseTimeout = 10 * time.Second

// A VirtualFilesystem (VFS for short) combines Filesystems in trees of Mounts.
//
// There is no analogue to the VirtualFilesystem type in Linux, as the
// equivalent state in Linux is global.
//
// +stateify savable
type VirtualFilesystem struct {
	// mountMu serializes mount mutations.
	//
	// mountMu is analogous to Linux's namespace_sem.
	mountMu virtualFilesystemMutex `state:"nosave"`

	// mounts maps (mount parent, mount point) pairs to mounts. (Since mounts
	// are uniquely namespaced, including mount parent in the key correctly
	// handles both bind mounts and mount namespaces; Linux does the same.)
	// Synchronization between mutators and readers is provided by mounts.seq;
	// synchronization between mutators is provided by mountMu.
	//
	// mounts is used to follow mount points during path traversal. We use a
	// single table rather than per-Dentry tables to reduce size (and therefore
	// cache footprint) for the vast majority of Dentries that are not mount
	// points.
	//
	// mounts is analogous to Linux's mount_hashtable.
	mounts mountTable `state:".([]*Mount)"`

	// mountpoints maps mount points to mounts at those points in all
	// namespaces. mountpoints is protected by mountMu.
	//
	// mountpoints is used to find mounts that must be umounted due to
	// removal of a mount point Dentry from another mount namespace. ("A file
	// or directory that is a mount point in one namespace that is not a mount
	// point in another namespace, may be renamed, unlinked, or removed
	// (rmdir(2)) in the mount namespace in which it is not a mount point
	// (subject to the usual permission checks)." - mount_namespaces(7))
	//
	// mountpoints is analogous to Linux's mountpoint_hashtable.
	mountpoints map[*Dentry]map[*Mount]struct{}

	// lastMountID is the last allocated mount ID. lastMountID is accessed
	// using atomic memory operations.
	lastMountID atomicbitops.Uint64

	// anonMount is a Mount, not included in mounts or mountpoints,
	// representing an anonFilesystem. anonMount is used to back
	// VirtualDentries returned by VirtualFilesystem.NewAnonVirtualDentry().
	// anonMount is immutable.
	//
	// anonMount is analogous to Linux's anon_inode_mnt.
	anonMount *Mount

	// devices contains all registered Devices. devices is protected by
	// devicesMu.
	devicesMu sync.RWMutex `state:"nosave"`
	devices   map[devTuple]*registeredDevice

	// dynCharDevMajorUsed contains all allocated dynamic character device
	// major numbers. dynCharDevMajorUsed is protected by dynCharDevMajorMu.
	dynCharDevMajorMu   sync.Mutex `state:"nosave"`
	dynCharDevMajorUsed map[uint32]struct{}

	// anonBlockDevMinor contains all allocated anonymous block device minor
	// numbers. anonBlockDevMinorNext is a lower bound for the smallest
	// unallocated anonymous block device number. anonBlockDevMinorNext and
	// anonBlockDevMinor are protected by anonBlockDevMinorMu.
	anonBlockDevMinorMu   sync.Mutex `state:"nosave"`
	anonBlockDevMinorNext uint32
	anonBlockDevMinor     map[uint32]struct{}

	// fsTypes contains all registered FilesystemTypes. fsTypes is protected by
	// fsTypesMu.
	fsTypesMu sync.RWMutex `state:"nosave"`
	fsTypes   map[string]*registeredFilesystemType

	// filesystems contains all Filesystems. filesystems is protected by
	// filesystemsMu.
	filesystemsMu sync.Mutex `state:"nosave"`
	filesystems   map[*Filesystem]struct{}

	// groupIDBitmap tracks which mount group IDs are available for allocation.
	groupIDBitmap bitmap.Bitmap

	// mountPromises contains all unresolved mount promises, keyed by the
	// promised VirtualDentry; the value is the queue that waiters register on.
	// mountPromises is accessed with mountPromisesMu held.
	mountPromisesMu sync.RWMutex `state:"nosave"`
	mountPromises   map[VirtualDentry]*waiter.Queue
}

// Init initializes a new VirtualFilesystem with no mounts or FilesystemTypes.
//
// Init panics if vfs has already been initialized (detected via a non-nil
// mountpoints map) or if the anonymous filesystem's device number can't be
// allocated. Every non-panicking path returns nil.
func (vfs *VirtualFilesystem) Init(ctx context.Context) error {
	if vfs.mountpoints != nil {
		panic("VFS already initialized")
	}
	vfs.mountpoints = make(map[*Dentry]map[*Mount]struct{})
	vfs.devices = make(map[devTuple]*registeredDevice)
	vfs.dynCharDevMajorUsed = make(map[uint32]struct{})
	vfs.anonBlockDevMinorNext = 1
	vfs.anonBlockDevMinor = make(map[uint32]struct{})
	vfs.fsTypes = make(map[string]*registeredFilesystemType)
	vfs.filesystems = make(map[*Filesystem]struct{})
	vfs.mounts.Init()
	vfs.groupIDBitmap = bitmap.New(1024)
	vfs.mountPromises = make(map[VirtualDentry]*waiter.Queue)

	// Construct vfs.anonMount. This has to come after the maps above are
	// created, since the device minor is allocated from them.
	anonfsDevMinor, err := vfs.GetAnonBlockDevMinor()
	if err != nil {
		// This shouldn't be possible since anonBlockDevMinorNext was
		// initialized to 1 above (no device numbers have been allocated yet).
		panic(fmt.Sprintf("VirtualFilesystem.Init: device number allocation for anonfs failed: %v", err))
	}
	anonfs := anonFilesystem{
		devMinor: anonfsDevMinor,
	}
	anonfs.vfsfs.Init(vfs, &anonFilesystemType{}, &anonfs)
	// Drop the local filesystem reference on return.
	// NOTE(review): presumably the mount created below keeps the filesystem
	// alive with its own reference — confirm against NewDisconnectedMount.
	defer anonfs.vfsfs.DecRef(ctx)
	anonMount := vfs.NewDisconnectedMount(&anonfs.vfsfs, nil, &MountOptions{})
	vfs.anonMount = anonMount

	return nil
}

// Release drops references on filesystem objects held by vfs: the reference
// on anonMount, followed by a Release call on every registered filesystem
// type.
//
// Precondition: This must be called after VFS.Init() has succeeded.
func (vfs *VirtualFilesystem) Release(ctx context.Context) {
	vfs.anonMount.DecRef(ctx)
	for _, fst := range vfs.fsTypes {
		fst.fsType.Release(ctx)
	}
}

// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
//
// +stateify savable
type PathOperation struct {
	// Root is the VFS root. References on Root are borrowed from the provider
	// of the PathOperation.
	//
	// Invariants: Root.Ok().
	Root VirtualDentry

	// Start is the starting point for the path traversal. References on Start
	// are borrowed from the provider of the PathOperation (i.e. the caller of
	// the VFS method to which the PathOperation was passed).
	//
	// Invariants: Start.Ok(). If Path.Absolute, then Start == Root.
	Start VirtualDentry

	// Path is the pathname traversed by this operation.
	Path fspath.Path

	// If FollowFinalSymlink is true, and the Dentry traversed by the final
	// path component represents a symbolic link, the symbolic link should be
	// followed.
	FollowFinalSymlink bool
}

// AccessAt checks whether a user with creds has access to the file at
// the given path.
226 func (vfs *VirtualFilesystem) AccessAt(ctx context.Context, creds *auth.Credentials, ats AccessTypes, pop *PathOperation) error { 227 rp := vfs.getResolvingPath(creds, pop) 228 for { 229 vfs.maybeBlockOnMountPromise(ctx, rp) 230 err := rp.mount.fs.impl.AccessAt(ctx, rp, creds, ats) 231 if err == nil { 232 rp.Release(ctx) 233 return nil 234 } 235 if !rp.handleError(ctx, err) { 236 rp.Release(ctx) 237 return err 238 } 239 } 240 } 241 242 // GetDentryAt returns a VirtualDentry representing the given path, at which a 243 // file must exist. A reference is taken on the returned VirtualDentry. 244 func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) { 245 rp := vfs.getResolvingPath(creds, pop) 246 for { 247 vfs.maybeBlockOnMountPromise(ctx, rp) 248 d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts) 249 if err == nil { 250 vd := VirtualDentry{ 251 mount: rp.mount, 252 dentry: d, 253 } 254 rp.mount.IncRef() 255 rp.Release(ctx) 256 return vd, nil 257 } 258 if !rp.handleError(ctx, err) { 259 rp.Release(ctx) 260 return VirtualDentry{}, err 261 } 262 } 263 } 264 265 // Preconditions: pop.Path.Begin.Ok(). 
266 func (vfs *VirtualFilesystem) getParentDirAndName(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (VirtualDentry, string, error) { 267 rp := vfs.getResolvingPath(creds, pop) 268 for { 269 vfs.maybeBlockOnMountPromise(ctx, rp) 270 parent, err := rp.mount.fs.impl.GetParentDentryAt(ctx, rp) 271 if err == nil { 272 parentVD := VirtualDentry{ 273 mount: rp.mount, 274 dentry: parent, 275 } 276 rp.mount.IncRef() 277 name := rp.Component() 278 rp.Release(ctx) 279 return parentVD, name, nil 280 } 281 if checkInvariants { 282 if rp.canHandleError(err) && rp.Done() { 283 panic(fmt.Sprintf("%T.GetParentDentryAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 284 } 285 } 286 if !rp.handleError(ctx, err) { 287 rp.Release(ctx) 288 return VirtualDentry{}, "", err 289 } 290 } 291 } 292 293 // LinkAt creates a hard link at newpop representing the existing file at 294 // oldpop. 295 func (vfs *VirtualFilesystem) LinkAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation) error { 296 oldVD, err := vfs.GetDentryAt(ctx, creds, oldpop, &GetDentryOptions{}) 297 if err != nil { 298 return err 299 } 300 301 if !newpop.Path.Begin.Ok() { 302 oldVD.DecRef(ctx) 303 if newpop.Path.Absolute { 304 return linuxerr.EEXIST 305 } 306 return linuxerr.ENOENT 307 } 308 if newpop.FollowFinalSymlink { 309 oldVD.DecRef(ctx) 310 ctx.Warningf("VirtualFilesystem.LinkAt: file creation paths can't follow final symlink") 311 return linuxerr.EINVAL 312 } 313 314 rp := vfs.getResolvingPath(creds, newpop) 315 for { 316 vfs.maybeBlockOnMountPromise(ctx, rp) 317 err := rp.mount.fs.impl.LinkAt(ctx, rp, oldVD) 318 if err == nil { 319 rp.Release(ctx) 320 oldVD.DecRef(ctx) 321 return nil 322 } 323 if checkInvariants { 324 if rp.canHandleError(err) && rp.Done() { 325 panic(fmt.Sprintf("%T.LinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 326 } 327 } 328 if !rp.handleError(ctx, err) { 329 rp.Release(ctx) 330 
oldVD.DecRef(ctx) 331 return err 332 } 333 } 334 } 335 336 // MkdirAt creates a directory at the given path. 337 func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error { 338 if !pop.Path.Begin.Ok() { 339 // pop.Path should not be empty in operations that create/delete files. 340 // This is consistent with mkdirat(dirfd, "", mode). 341 if pop.Path.Absolute { 342 return linuxerr.EEXIST 343 } 344 return linuxerr.ENOENT 345 } 346 if pop.FollowFinalSymlink { 347 ctx.Warningf("VirtualFilesystem.MkdirAt: file creation paths can't follow final symlink") 348 return linuxerr.EINVAL 349 } 350 // "Under Linux, apart from the permission bits, the S_ISVTX mode bit is 351 // also honored." - mkdir(2) 352 opts.Mode &= 0777 | linux.S_ISVTX 353 354 rp := vfs.getResolvingPath(creds, pop) 355 for { 356 vfs.maybeBlockOnMountPromise(ctx, rp) 357 err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts) 358 if err == nil { 359 rp.Release(ctx) 360 return nil 361 } 362 if checkInvariants { 363 if rp.canHandleError(err) && rp.Done() { 364 panic(fmt.Sprintf("%T.MkdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 365 } 366 } 367 if !rp.handleError(ctx, err) { 368 rp.Release(ctx) 369 return err 370 } 371 } 372 } 373 374 // MknodAt creates a file of the given mode at the given path. It returns an 375 // error from the linuxerr package. 376 func (vfs *VirtualFilesystem) MknodAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MknodOptions) error { 377 if !pop.Path.Begin.Ok() { 378 // pop.Path should not be empty in operations that create/delete files. 379 // This is consistent with mknodat(dirfd, "", mode, dev). 
380 if pop.Path.Absolute { 381 return linuxerr.EEXIST 382 } 383 return linuxerr.ENOENT 384 } 385 if pop.FollowFinalSymlink { 386 ctx.Warningf("VirtualFilesystem.MknodAt: file creation paths can't follow final symlink") 387 return linuxerr.EINVAL 388 } 389 390 rp := vfs.getResolvingPath(creds, pop) 391 for { 392 vfs.maybeBlockOnMountPromise(ctx, rp) 393 err := rp.mount.fs.impl.MknodAt(ctx, rp, *opts) 394 if err == nil { 395 rp.Release(ctx) 396 return nil 397 } 398 if checkInvariants { 399 if rp.canHandleError(err) && rp.Done() { 400 panic(fmt.Sprintf("%T.MknodAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 401 } 402 } 403 if !rp.handleError(ctx, err) { 404 rp.Release(ctx) 405 return err 406 } 407 } 408 } 409 410 // OpenAt returns a FileDescription providing access to the file at the given 411 // path. A reference is taken on the returned FileDescription. 412 func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) { 413 fsmetric.Opens.Increment() 414 415 // Remove: 416 // 417 // - O_CLOEXEC, which affects file descriptors and therefore must be 418 // handled outside of VFS. 419 // 420 // - Unknown flags. 421 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_LARGEFILE | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE 422 // Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC. 423 if opts.Flags&linux.O_SYNC != 0 { 424 opts.Flags |= linux.O_DSYNC 425 } 426 // Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified 427 // with O_DIRECTORY and a writable access mode (to ensure that it fails on 428 // filesystem implementations that do not support it). 
429 if opts.Flags&linux.O_TMPFILE != 0 { 430 if opts.Flags&linux.O_DIRECTORY == 0 { 431 return nil, linuxerr.EINVAL 432 } 433 if opts.Flags&linux.O_CREAT != 0 { 434 return nil, linuxerr.EINVAL 435 } 436 if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY { 437 return nil, linuxerr.EINVAL 438 } 439 } 440 // O_PATH causes most other flags to be ignored. 441 if opts.Flags&linux.O_PATH != 0 { 442 opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH 443 } 444 // "On Linux, the following bits are also honored in mode: [S_ISUID, 445 // S_ISGID, S_ISVTX]" - open(2) 446 opts.Mode &= 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX 447 448 if opts.Flags&linux.O_NOFOLLOW != 0 { 449 pop.FollowFinalSymlink = false 450 } 451 if opts.Flags&linux.O_PATH != 0 { 452 return vfs.openOPathFD(ctx, creds, pop, opts.Flags) 453 } 454 rp := vfs.getResolvingPath(creds, pop) 455 if opts.Flags&linux.O_DIRECTORY != 0 { 456 rp.mustBeDir = true 457 } 458 for { 459 vfs.maybeBlockOnMountPromise(ctx, rp) 460 fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts) 461 if err == nil { 462 rp.Release(ctx) 463 464 if opts.FileExec { 465 if fd.Mount().Flags.NoExec { 466 fd.DecRef(ctx) 467 return nil, linuxerr.EACCES 468 } 469 470 // Only a regular file can be executed. 471 stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE}) 472 if err != nil { 473 fd.DecRef(ctx) 474 return nil, err 475 } 476 if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG { 477 fd.DecRef(ctx) 478 return nil, linuxerr.EACCES 479 } 480 } 481 482 fd.Dentry().InotifyWithParent(ctx, linux.IN_OPEN, 0, PathEvent) 483 return fd, nil 484 } 485 if !rp.handleError(ctx, err) { 486 rp.Release(ctx) 487 return nil, err 488 } 489 } 490 } 491 492 // ReadlinkAt returns the target of the symbolic link at the given path. 
493 func (vfs *VirtualFilesystem) ReadlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (string, error) { 494 rp := vfs.getResolvingPath(creds, pop) 495 for { 496 vfs.maybeBlockOnMountPromise(ctx, rp) 497 target, err := rp.mount.fs.impl.ReadlinkAt(ctx, rp) 498 if err == nil { 499 rp.Release(ctx) 500 return target, nil 501 } 502 if !rp.handleError(ctx, err) { 503 rp.Release(ctx) 504 return "", err 505 } 506 } 507 } 508 509 // RenameAt renames the file at oldpop to newpop. 510 func (vfs *VirtualFilesystem) RenameAt(ctx context.Context, creds *auth.Credentials, oldpop, newpop *PathOperation, opts *RenameOptions) error { 511 if !oldpop.Path.Begin.Ok() { 512 if oldpop.Path.Absolute { 513 return linuxerr.EBUSY 514 } 515 return linuxerr.ENOENT 516 } 517 if oldpop.FollowFinalSymlink { 518 ctx.Warningf("VirtualFilesystem.RenameAt: source path can't follow final symlink") 519 return linuxerr.EINVAL 520 } 521 522 oldParentVD, oldName, err := vfs.getParentDirAndName(ctx, creds, oldpop) 523 if err != nil { 524 return err 525 } 526 if oldName == "." || oldName == ".." 
{ 527 oldParentVD.DecRef(ctx) 528 return linuxerr.EBUSY 529 } 530 if len(oldName) > linux.NAME_MAX { 531 oldParentVD.DecRef(ctx) 532 return linuxerr.ENAMETOOLONG 533 } 534 535 if !newpop.Path.Begin.Ok() { 536 oldParentVD.DecRef(ctx) 537 if newpop.Path.Absolute { 538 return linuxerr.EBUSY 539 } 540 return linuxerr.ENOENT 541 } 542 if newpop.FollowFinalSymlink { 543 oldParentVD.DecRef(ctx) 544 ctx.Warningf("VirtualFilesystem.RenameAt: destination path can't follow final symlink") 545 return linuxerr.EINVAL 546 } 547 548 rp := vfs.getResolvingPath(creds, newpop) 549 renameOpts := *opts 550 if oldpop.Path.Dir { 551 renameOpts.MustBeDir = true 552 } 553 for { 554 vfs.maybeBlockOnMountPromise(ctx, rp) 555 err := rp.mount.fs.impl.RenameAt(ctx, rp, oldParentVD, oldName, renameOpts) 556 if err == nil { 557 rp.Release(ctx) 558 oldParentVD.DecRef(ctx) 559 return nil 560 } 561 if checkInvariants { 562 if rp.canHandleError(err) && rp.Done() { 563 panic(fmt.Sprintf("%T.RenameAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 564 } 565 } 566 if !rp.handleError(ctx, err) { 567 rp.Release(ctx) 568 oldParentVD.DecRef(ctx) 569 return err 570 } 571 } 572 } 573 574 // RmdirAt removes the directory at the given path. 575 func (vfs *VirtualFilesystem) RmdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 576 if !pop.Path.Begin.Ok() { 577 // pop.Path should not be empty in operations that create/delete files. 578 // This is consistent with unlinkat(dirfd, "", AT_REMOVEDIR). 
579 if pop.Path.Absolute { 580 return linuxerr.EBUSY 581 } 582 return linuxerr.ENOENT 583 } 584 if pop.FollowFinalSymlink { 585 ctx.Warningf("VirtualFilesystem.RmdirAt: file deletion paths can't follow final symlink") 586 return linuxerr.EINVAL 587 } 588 589 rp := vfs.getResolvingPath(creds, pop) 590 for { 591 vfs.maybeBlockOnMountPromise(ctx, rp) 592 err := rp.mount.fs.impl.RmdirAt(ctx, rp) 593 if err == nil { 594 rp.Release(ctx) 595 return nil 596 } 597 if checkInvariants { 598 if rp.canHandleError(err) && rp.Done() { 599 panic(fmt.Sprintf("%T.RmdirAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 600 } 601 } 602 if !rp.handleError(ctx, err) { 603 rp.Release(ctx) 604 return err 605 } 606 } 607 } 608 609 // SetStatAt changes metadata for the file at the given path. 610 func (vfs *VirtualFilesystem) SetStatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetStatOptions) error { 611 rp := vfs.getResolvingPath(creds, pop) 612 for { 613 vfs.maybeBlockOnMountPromise(ctx, rp) 614 err := rp.mount.fs.impl.SetStatAt(ctx, rp, *opts) 615 if err == nil { 616 rp.Release(ctx) 617 return nil 618 } 619 if !rp.handleError(ctx, err) { 620 rp.Release(ctx) 621 return err 622 } 623 } 624 } 625 626 // StatAt returns metadata for the file at the given path. 627 func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) { 628 rp := vfs.getResolvingPath(creds, pop) 629 for { 630 vfs.maybeBlockOnMountPromise(ctx, rp) 631 stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts) 632 if err == nil { 633 rp.Release(ctx) 634 return stat, nil 635 } 636 if !rp.handleError(ctx, err) { 637 rp.Release(ctx) 638 return linux.Statx{}, err 639 } 640 } 641 } 642 643 // StatFSAt returns metadata for the filesystem containing the file at the 644 // given path. 
645 func (vfs *VirtualFilesystem) StatFSAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) (linux.Statfs, error) { 646 rp := vfs.getResolvingPath(creds, pop) 647 for { 648 vfs.maybeBlockOnMountPromise(ctx, rp) 649 statfs, err := rp.mount.fs.impl.StatFSAt(ctx, rp) 650 if err == nil { 651 rp.Release(ctx) 652 return statfs, nil 653 } 654 if !rp.handleError(ctx, err) { 655 rp.Release(ctx) 656 return linux.Statfs{}, err 657 } 658 } 659 } 660 661 // SymlinkAt creates a symbolic link at the given path with the given target. 662 func (vfs *VirtualFilesystem) SymlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, target string) error { 663 if !pop.Path.Begin.Ok() { 664 // pop.Path should not be empty in operations that create/delete files. 665 // This is consistent with symlinkat(oldpath, newdirfd, ""). 666 if pop.Path.Absolute { 667 return linuxerr.EEXIST 668 } 669 return linuxerr.ENOENT 670 } 671 if pop.FollowFinalSymlink { 672 ctx.Warningf("VirtualFilesystem.SymlinkAt: file creation paths can't follow final symlink") 673 return linuxerr.EINVAL 674 } 675 676 rp := vfs.getResolvingPath(creds, pop) 677 for { 678 vfs.maybeBlockOnMountPromise(ctx, rp) 679 err := rp.mount.fs.impl.SymlinkAt(ctx, rp, target) 680 if err == nil { 681 rp.Release(ctx) 682 return nil 683 } 684 if checkInvariants { 685 if rp.canHandleError(err) && rp.Done() { 686 panic(fmt.Sprintf("%T.SymlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 687 } 688 } 689 if !rp.handleError(ctx, err) { 690 rp.Release(ctx) 691 return err 692 } 693 } 694 } 695 696 // UnlinkAt deletes the non-directory file at the given path. 697 func (vfs *VirtualFilesystem) UnlinkAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation) error { 698 if !pop.Path.Begin.Ok() { 699 // pop.Path should not be empty in operations that create/delete files. 700 // This is consistent with unlinkat(dirfd, "", 0). 
701 if pop.Path.Absolute { 702 return linuxerr.EBUSY 703 } 704 return linuxerr.ENOENT 705 } 706 if pop.FollowFinalSymlink { 707 ctx.Warningf("VirtualFilesystem.UnlinkAt: file deletion paths can't follow final symlink") 708 return linuxerr.EINVAL 709 } 710 711 rp := vfs.getResolvingPath(creds, pop) 712 for { 713 vfs.maybeBlockOnMountPromise(ctx, rp) 714 err := rp.mount.fs.impl.UnlinkAt(ctx, rp) 715 if err == nil { 716 rp.Release(ctx) 717 return nil 718 } 719 if checkInvariants { 720 if rp.canHandleError(err) && rp.Done() { 721 panic(fmt.Sprintf("%T.UnlinkAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 722 } 723 } 724 if !rp.handleError(ctx, err) { 725 rp.Release(ctx) 726 return err 727 } 728 } 729 } 730 731 // BoundEndpointAt gets the bound endpoint at the given path, if one exists. 732 func (vfs *VirtualFilesystem) BoundEndpointAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *BoundEndpointOptions) (transport.BoundEndpoint, error) { 733 rp := vfs.getResolvingPath(creds, pop) 734 for { 735 vfs.maybeBlockOnMountPromise(ctx, rp) 736 bep, err := rp.mount.fs.impl.BoundEndpointAt(ctx, rp, *opts) 737 if err == nil { 738 rp.Release(ctx) 739 return bep, nil 740 } 741 if checkInvariants { 742 if rp.canHandleError(err) && rp.Done() { 743 panic(fmt.Sprintf("%T.BoundEndpointAt() consumed all path components and returned %v", rp.mount.fs.impl, err)) 744 } 745 } 746 if !rp.handleError(ctx, err) { 747 rp.Release(ctx) 748 return nil, err 749 } 750 } 751 } 752 753 // ListXattrAt returns all extended attribute names for the file at the given 754 // path. 
755 func (vfs *VirtualFilesystem) ListXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, size uint64) ([]string, error) { 756 rp := vfs.getResolvingPath(creds, pop) 757 for { 758 vfs.maybeBlockOnMountPromise(ctx, rp) 759 names, err := rp.mount.fs.impl.ListXattrAt(ctx, rp, size) 760 if err == nil { 761 rp.Release(ctx) 762 return names, nil 763 } 764 if linuxerr.Equals(linuxerr.EOPNOTSUPP, err) { 765 // Linux doesn't actually return EOPNOTSUPP in this case; instead, 766 // fs/xattr.c:vfs_listxattr() falls back to allowing the security 767 // subsystem to return security extended attributes, which by 768 // default don't exist. 769 rp.Release(ctx) 770 return nil, nil 771 } 772 if !rp.handleError(ctx, err) { 773 rp.Release(ctx) 774 return nil, err 775 } 776 } 777 } 778 779 // GetXattrAt returns the value associated with the given extended attribute 780 // for the file at the given path. 781 func (vfs *VirtualFilesystem) GetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetXattrOptions) (string, error) { 782 rp := vfs.getResolvingPath(creds, pop) 783 for { 784 vfs.maybeBlockOnMountPromise(ctx, rp) 785 val, err := rp.mount.fs.impl.GetXattrAt(ctx, rp, *opts) 786 if err == nil { 787 rp.Release(ctx) 788 return val, nil 789 } 790 if !rp.handleError(ctx, err) { 791 rp.Release(ctx) 792 return "", err 793 } 794 } 795 } 796 797 // SetXattrAt changes the value associated with the given extended attribute 798 // for the file at the given path. 
799 func (vfs *VirtualFilesystem) SetXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *SetXattrOptions) error { 800 rp := vfs.getResolvingPath(creds, pop) 801 for { 802 vfs.maybeBlockOnMountPromise(ctx, rp) 803 err := rp.mount.fs.impl.SetXattrAt(ctx, rp, *opts) 804 if err == nil { 805 rp.Release(ctx) 806 return nil 807 } 808 if !rp.handleError(ctx, err) { 809 rp.Release(ctx) 810 return err 811 } 812 } 813 } 814 815 // RemoveXattrAt removes the given extended attribute from the file at rp. 816 func (vfs *VirtualFilesystem) RemoveXattrAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, name string) error { 817 rp := vfs.getResolvingPath(creds, pop) 818 for { 819 vfs.maybeBlockOnMountPromise(ctx, rp) 820 err := rp.mount.fs.impl.RemoveXattrAt(ctx, rp, name) 821 if err == nil { 822 rp.Release(ctx) 823 return nil 824 } 825 if !rp.handleError(ctx, err) { 826 rp.Release(ctx) 827 return err 828 } 829 } 830 } 831 832 // SyncAllFilesystems has the semantics of Linux's sync(2). 833 func (vfs *VirtualFilesystem) SyncAllFilesystems(ctx context.Context) error { 834 var retErr error 835 for fs := range vfs.getFilesystems() { 836 if err := fs.impl.Sync(ctx); err != nil && retErr == nil { 837 retErr = err 838 } 839 fs.DecRef(ctx) 840 } 841 return retErr 842 } 843 844 func (vfs *VirtualFilesystem) getFilesystems() map[*Filesystem]struct{} { 845 fss := make(map[*Filesystem]struct{}) 846 vfs.filesystemsMu.Lock() 847 defer vfs.filesystemsMu.Unlock() 848 for fs := range vfs.filesystems { 849 if !fs.TryIncRef() { 850 continue 851 } 852 fss[fs] = struct{}{} 853 } 854 return fss 855 } 856 857 // MkdirAllAt recursively creates non-existent directories on the given path 858 // (including the last component). 
859 func (vfs *VirtualFilesystem) MkdirAllAt(ctx context.Context, currentPath string, root VirtualDentry, creds *auth.Credentials, mkdirOpts *MkdirOptions, mustBeDir bool) error { 860 pop := &PathOperation{ 861 Root: root, 862 Start: root, 863 Path: fspath.Parse(currentPath), 864 } 865 stat, err := vfs.StatAt(ctx, creds, pop, &StatOptions{Mask: linux.STATX_TYPE}) 866 switch { 867 case err == nil: 868 if mustBeDir && (stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.FileTypeMask != linux.ModeDirectory) { 869 return linuxerr.ENOTDIR 870 } 871 // Directory already exists. 872 return nil 873 case linuxerr.Equals(linuxerr.ENOENT, err): 874 // Expected, we will create the dir. 875 default: 876 return fmt.Errorf("stat failed for %q during directory creation: %w", currentPath, err) 877 } 878 879 // Recurse to ensure parent is created and then create the final directory. 880 if err := vfs.MkdirAllAt(ctx, path.Dir(currentPath), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 881 return err 882 } 883 if err := vfs.MkdirAt(ctx, creds, pop, mkdirOpts); err != nil { 884 return fmt.Errorf("failed to create directory %q: %w", currentPath, err) 885 } 886 return nil 887 } 888 889 // MakeSyntheticMountpoint creates parent directories of target if they do not 890 // exist and attempts to create a directory for the mountpoint. If a 891 // non-directory file already exists there then we allow it. 892 func (vfs *VirtualFilesystem) MakeSyntheticMountpoint(ctx context.Context, target string, root VirtualDentry, creds *auth.Credentials) error { 893 mkdirOpts := &MkdirOptions{Mode: 0777, ForSyntheticMountpoint: true} 894 895 // Make sure the parent directory of target exists. 896 if err := vfs.MkdirAllAt(ctx, path.Dir(target), root, creds, mkdirOpts, true /* mustBeDir */); err != nil { 897 return fmt.Errorf("failed to create parent directory of mountpoint %q: %w", target, err) 898 } 899 900 // Attempt to mkdir the final component. 
If a file (of any type) exists 901 // then we let allow mounting on top of that because we do not require the 902 // target to be an existing directory, unlike Linux mount(2). 903 if err := vfs.MkdirAllAt(ctx, target, root, creds, mkdirOpts, false /* mustBeDir */); err != nil { 904 return fmt.Errorf("failed to create mountpoint %q: %w", target, err) 905 } 906 return nil 907 } 908 909 // RegisterMountPromise marks vd as a mount promise. This means any VFS 910 // operation on vd will be blocked until another process mounts over it or the 911 // mount promise times out. 912 func (vfs *VirtualFilesystem) RegisterMountPromise(vd VirtualDentry) error { 913 vfs.mountPromisesMu.Lock() 914 defer vfs.mountPromisesMu.Unlock() 915 if _, ok := vfs.mountPromises[vd]; ok { 916 return fmt.Errorf("mount promise for %v already exists", vd) 917 } 918 wq := &waiter.Queue{} 919 vfs.mountPromises[vd] = wq 920 return nil 921 } 922 923 // Emit a SentryMountPromiseBlockEvent and wait for the mount promise to be 924 // resolved or time out. 925 func (vfs *VirtualFilesystem) maybeBlockOnMountPromise(ctx context.Context, rp *ResolvingPath) { 926 vd := VirtualDentry{rp.mount, rp.start} 927 vfs.mountPromisesMu.RLock() 928 wq, ok := vfs.mountPromises[vd] 929 vfs.mountPromisesMu.RUnlock() 930 if !ok { 931 return 932 } 933 934 path, err := vfs.PathnameReachable(ctx, rp.root, vd) 935 if err != nil { 936 panic(fmt.Sprintf("could not reach %v from root", rp.Component())) 937 } 938 e, ch := waiter.NewChannelEntry(waiter.EventOut) 939 wq.EventRegister(&e) 940 eventchannel.Emit(&epb.SentryMountPromiseBlockEvent{Path: path}) 941 942 select { 943 case <-ch: 944 // Update rp to point to the promised mount. 
945 newMnt := vfs.getMountAt(ctx, rp.mount, rp.start) 946 rp.mount = newMnt 947 rp.start = newMnt.root 948 rp.flags = rp.flags&^rpflagsHaveStartRef | rpflagsHaveMountRef 949 case <-time.After(mountPromiseTimeout): 950 log.Warningf("mount promise for %s timed out, proceeding with VFS operation", path) 951 } 952 } 953 954 func (vfs *VirtualFilesystem) maybeResolveMountPromise(vd VirtualDentry) { 955 vfs.mountPromisesMu.Lock() 956 defer vfs.mountPromisesMu.Unlock() 957 wq, ok := vfs.mountPromises[vd] 958 if !ok { 959 return 960 } 961 wq.Notify(waiter.EventOut) 962 delete(vfs.mountPromises, vd) 963 } 964 965 // A VirtualDentry represents a node in a VFS tree, by combining a Dentry 966 // (which represents a node in a Filesystem's tree) and a Mount (which 967 // represents the Filesystem's position in a VFS mount tree). 968 // 969 // VirtualDentry's semantics are similar to that of a Go interface object 970 // representing a pointer: it is a copyable value type that represents 971 // references to another entity. The zero value of VirtualDentry is an "empty 972 // VirtualDentry", directly analogous to a nil interface object. 973 // VirtualDentry.Ok() checks that a VirtualDentry is not zero-valued; unless 974 // otherwise specified, all other VirtualDentry methods require 975 // VirtualDentry.Ok() == true. 976 // 977 // Mounts and Dentries are reference-counted, requiring that users call 978 // VirtualDentry.{Inc,Dec}Ref() as appropriate. We often colloquially refer to 979 // references on the Mount and Dentry referred to by a VirtualDentry as 980 // references on the VirtualDentry itself. Unless otherwise specified, all 981 // VirtualDentry methods require that a reference is held on the VirtualDentry. 982 // 983 // VirtualDentry is analogous to Linux's struct path. 984 // 985 // +stateify savable 986 type VirtualDentry struct { 987 mount *Mount 988 dentry *Dentry 989 } 990 991 // MakeVirtualDentry creates a VirtualDentry. 
992 func MakeVirtualDentry(mount *Mount, dentry *Dentry) VirtualDentry { 993 return VirtualDentry{ 994 mount: mount, 995 dentry: dentry, 996 } 997 } 998 999 // Ok returns true if vd is not empty. It does not require that a reference is 1000 // held. 1001 func (vd VirtualDentry) Ok() bool { 1002 return vd.mount != nil 1003 } 1004 1005 // IncRef increments the reference counts on the Mount and Dentry represented 1006 // by vd. 1007 func (vd VirtualDentry) IncRef() { 1008 vd.mount.IncRef() 1009 vd.dentry.IncRef() 1010 } 1011 1012 // DecRef decrements the reference counts on the Mount and Dentry represented 1013 // by vd. 1014 func (vd VirtualDentry) DecRef(ctx context.Context) { 1015 vd.dentry.DecRef(ctx) 1016 vd.mount.DecRef(ctx) 1017 } 1018 1019 // Mount returns the Mount associated with vd. It does not take a reference on 1020 // the returned Mount. 1021 func (vd VirtualDentry) Mount() *Mount { 1022 return vd.mount 1023 } 1024 1025 // Dentry returns the Dentry associated with vd. It does not take a reference 1026 // on the returned Dentry. 1027 func (vd VirtualDentry) Dentry() *Dentry { 1028 return vd.dentry 1029 }