gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/kernfs/filesystem.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package kernfs 16 17 // This file implements vfs.FilesystemImpl for kernfs. 18 19 import ( 20 "fmt" 21 22 "gvisor.dev/gvisor/pkg/abi/linux" 23 "gvisor.dev/gvisor/pkg/context" 24 "gvisor.dev/gvisor/pkg/errors/linuxerr" 25 "gvisor.dev/gvisor/pkg/fspath" 26 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 27 "gvisor.dev/gvisor/pkg/sentry/socket/unix/transport" 28 "gvisor.dev/gvisor/pkg/sentry/vfs" 29 ) 30 31 // stepExistingLocked resolves rp.Component() in parent directory vfsd. 32 // 33 // stepExistingLocked is loosely analogous to fs/namei.c:walk_component(). 34 // 35 // Preconditions: 36 // - Filesystem.mu must be locked for at least reading. 37 // - !rp.Done(). 38 // 39 // Postcondition: Caller must call fs.processDeferredDecRefs*. 40 func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, bool, error) { 41 if !d.isDir() { 42 return nil, false, linuxerr.ENOTDIR 43 } 44 // Directory searchable? 45 if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { 46 return nil, false, err 47 } 48 name := rp.Component() 49 // Revalidation must be skipped if name is "." or ".."; d or its parent 50 // respectively can't be expected to transition from invalidated back to 51 // valid, so detecting invalidation and retrying would loop forever. This 52 // is consistent with Linux: fs/namei.c:walk_component() => lookup_fast() 53 // calls d_revalidate(), but walk_component() => handle_dots() does not. 54 if name == "." { 55 rp.Advance() 56 return d, false, nil 57 } 58 if name == ".." { 59 if isRoot, err := rp.CheckRoot(ctx, d.VFSDentry()); err != nil { 60 return nil, false, err 61 } else if isRoot || d.parent.Load() == nil { 62 rp.Advance() 63 return d, false, nil 64 } 65 if err := rp.CheckMount(ctx, d.Parent().VFSDentry()); err != nil { 66 return nil, false, err 67 } 68 rp.Advance() 69 return d.parent.Load(), false, nil 70 } 71 if len(name) > linux.NAME_MAX { 72 return nil, false, linuxerr.ENAMETOOLONG 73 } 74 next, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), d, name) 75 if err != nil { 76 return nil, false, err 77 } 78 if err := rp.CheckMount(ctx, next.VFSDentry()); err != nil { 79 return nil, false, err 80 } 81 // Resolve any symlink at current path component. 82 if rp.ShouldFollowSymlink() && next.isSymlink() { 83 targetVD, targetPathname, err := next.inode.Getlink(ctx, rp.Mount()) 84 if err != nil { 85 return nil, false, err 86 } 87 if targetVD.Ok() { 88 followedTarget, err := rp.HandleJump(targetVD) 89 fs.deferDecRefVD(ctx, targetVD) 90 return d, followedTarget, err 91 } 92 followedSymlink, err := rp.HandleSymlink(targetPathname) 93 return d, followedSymlink, err 94 } 95 rp.Advance() 96 return next, false, nil 97 } 98 99 // revalidateChildLocked is called to look up the child of parent named name, 100 // while verifying that any cached lookups are still correct. 101 // 102 // Preconditions: 103 // - Filesystem.mu must be locked for at least reading. 104 // - parent.isDir(). 105 // - name is not "." or "..". 106 // 107 // Postconditions: Caller must call fs.processDeferredDecRefs*. 108 func (fs *Filesystem) revalidateChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, parent *Dentry, name string) (*Dentry, error) { 109 parent.dirMu.Lock() 110 defer parent.dirMu.Unlock() // may be temporarily unlocked and re-locked below 111 child := parent.children[name] 112 for child != nil { 113 // Cached dentry exists, revalidate. 114 if child.inode.Valid(ctx, parent, name) { 115 break 116 } 117 delete(parent.children, child.name) 118 parent.dirMu.Unlock() 119 fs.invalidateRemovedChildLocked(ctx, vfsObj, child) 120 parent.dirMu.Lock() 121 // Check for concurrent insertion of a new cached dentry. 122 child = parent.children[name] 123 } 124 if child == nil { 125 // Dentry isn't cached; it either doesn't exist or failed revalidation. 126 // Attempt to resolve it via Lookup. 127 childInode, err := parent.inode.Lookup(ctx, name) 128 if err != nil { 129 return nil, err 130 } 131 var newChild Dentry 132 newChild.Init(fs, childInode) // childInode's ref is transferred to newChild. 133 parent.insertChildLocked(name, &newChild) 134 child = &newChild 135 136 // Drop the ref on newChild. This will cause the dentry to get pruned 137 // from the dentry tree by the end of current filesystem operation 138 // (before returning to the VFS layer) if another ref is not picked on 139 // this dentry. 140 if !childInode.Keep() { 141 fs.deferDecRef(&newChild) 142 } 143 } 144 return child, nil 145 } 146 147 // Preconditions: 148 // - Filesystem.mu must be locked for at least reading. 149 // - d has been removed from its parent.children. 150 // 151 // Postconditions: Caller must call fs.processDeferredDecRefs*. 152 func (fs *Filesystem) invalidateRemovedChildLocked(ctx context.Context, vfsObj *vfs.VirtualFilesystem, d *Dentry) { 153 toInvalidate := []*Dentry{d} 154 for len(toInvalidate) != 0 { 155 d := toInvalidate[len(toInvalidate)-1] 156 toInvalidate = toInvalidate[:len(toInvalidate)-1] 157 158 if d.inode.Keep() { 159 fs.deferDecRef(d) 160 } 161 rcs := vfsObj.InvalidateDentry(ctx, d.VFSDentry()) 162 for _, rc := range rcs { 163 fs.deferDecRef(rc) 164 } 165 166 if d.isDir() { 167 d.dirMu.Lock() 168 for name, child := range d.children { 169 toInvalidate = append(toInvalidate, child) 170 delete(d.children, name) 171 } 172 d.dirMu.Unlock() 173 } 174 } 175 } 176 177 // walkExistingLocked resolves rp to an existing file. 178 // 179 // walkExistingLocked is loosely analogous to Linux's 180 // fs/namei.c:path_lookupat(). 181 // 182 // Preconditions: Filesystem.mu must be locked for at least reading. 183 // 184 // Postconditions: Caller must call fs.processDeferredDecRefs*. 185 func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*Dentry, error) { 186 d := rp.Start().Impl().(*Dentry) 187 for !rp.Done() { 188 var err error 189 d, _, err = fs.stepExistingLocked(ctx, rp, d) 190 if err != nil { 191 return nil, err 192 } 193 } 194 if rp.MustBeDir() && !d.isDir() { 195 return nil, linuxerr.ENOTDIR 196 } 197 return d, nil 198 } 199 200 // walkParentDirLocked resolves all but the last path component of rp to an 201 // existing directory. It does not check that the returned directory is 202 // searchable by the provider of rp. 203 // 204 // walkParentDirLocked is loosely analogous to Linux's 205 // fs/namei.c:path_parentat(). 206 // 207 // Preconditions: 208 // - Filesystem.mu must be locked for at least reading. 209 // - !rp.Done(). 210 // 211 // Postconditions: Caller must call fs.processDeferredDecRefs*. 212 func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) (*Dentry, error) { 213 for !rp.Final() { 214 var err error 215 d, _, err = fs.stepExistingLocked(ctx, rp, d) 216 if err != nil { 217 return nil, err 218 } 219 } 220 if !d.isDir() { 221 return nil, linuxerr.ENOTDIR 222 } 223 return d, nil 224 } 225 226 // checkCreateLocked checks that a file named rp.Component() may be created in 227 // directory parent, then returns rp.Component(). 228 // 229 // Preconditions: 230 // - Filesystem.mu must be locked for at least reading. 231 // - isDir(parentInode) == true. 232 func checkCreateLocked(ctx context.Context, creds *auth.Credentials, name string, parent *Dentry) error { 233 // Order of checks is important. First check if parent directory can be 234 // executed, then check for existence, and lastly check if mount is writable. 235 if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayExec); err != nil { 236 return err 237 } 238 if name == "." || name == ".." { 239 return linuxerr.EEXIST 240 } 241 if len(name) > linux.NAME_MAX { 242 return linuxerr.ENAMETOOLONG 243 } 244 if _, ok := parent.children[name]; ok { 245 return linuxerr.EEXIST 246 } 247 if parent.VFSDentry().IsDead() { 248 return linuxerr.ENOENT 249 } 250 if err := parent.inode.CheckPermissions(ctx, creds, vfs.MayWrite); err != nil { 251 return err 252 } 253 return nil 254 } 255 256 // checkDeleteLocked checks that the file represented by vfsd may be deleted. 257 // 258 // Preconditions: Filesystem.mu must be locked for at least reading. 259 func checkDeleteLocked(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry) error { 260 parent := d.parent.Load() 261 if parent == nil { 262 return linuxerr.EBUSY 263 } 264 if parent.vfsd.IsDead() { 265 return linuxerr.ENOENT 266 } 267 if d.vfsd.IsDead() { 268 // This implies a duplicate unlink on an orphaned dentry, where the path 269 // resolution was successful. This is possible when the orphan is 270 // replaced by a new node of the same name (so the path resolution 271 // succeeds), and the orphan is unlinked again through a dirfd using 272 // unlinkat(2) (so the unlink refers to the orphan and not the new 273 // node). See Linux, fs/namei.c:do_rmdir(). 274 return linuxerr.EINVAL 275 } 276 if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 277 return err 278 } 279 return nil 280 } 281 282 // Release implements vfs.FilesystemImpl.Release. 283 func (fs *Filesystem) Release(ctx context.Context) { 284 root := fs.root 285 if root == nil { 286 return 287 } 288 fs.mu.Lock() 289 root.releaseKeptDentriesLocked(ctx) 290 for fs.cachedDentriesLen != 0 { 291 fs.evictCachedDentryLocked(ctx) 292 } 293 fs.mu.Unlock() 294 // Drop ref acquired in Dentry.InitRoot(). 295 root.DecRef(ctx) 296 } 297 298 // releaseKeptDentriesLocked recursively drops all dentry references created by 299 // Lookup when Dentry.inode.Keep() is true. 300 // 301 // Precondition: Filesystem.mu is held. 302 func (d *Dentry) releaseKeptDentriesLocked(ctx context.Context) { 303 if d.inode.Keep() && d != d.fs.root { 304 d.decRefLocked(ctx) 305 } 306 307 if d.isDir() { 308 var children []*Dentry 309 d.dirMu.Lock() 310 for _, child := range d.children { 311 children = append(children, child) 312 } 313 d.dirMu.Unlock() 314 for _, child := range children { 315 child.releaseKeptDentriesLocked(ctx) 316 } 317 } 318 } 319 320 // Sync implements vfs.FilesystemImpl.Sync. 321 func (fs *Filesystem) Sync(ctx context.Context) error { 322 // All filesystem state is in-memory. 323 return nil 324 } 325 326 // AccessAt implements vfs.Filesystem.Impl.AccessAt. 327 func (fs *Filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { 328 fs.mu.RLock() 329 defer fs.processDeferredDecRefs(ctx) 330 defer fs.mu.RUnlock() 331 332 d, err := fs.walkExistingLocked(ctx, rp) 333 if err != nil { 334 return err 335 } 336 if err := d.inode.CheckPermissions(ctx, creds, ats); err != nil { 337 return err 338 } 339 if ats.MayWrite() && rp.Mount().ReadOnly() { 340 return linuxerr.EROFS 341 } 342 return nil 343 } 344 345 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. 346 func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { 347 fs.mu.RLock() 348 defer fs.processDeferredDecRefs(ctx) 349 defer fs.mu.RUnlock() 350 d, err := fs.walkExistingLocked(ctx, rp) 351 if err != nil { 352 return nil, err 353 } 354 355 if opts.CheckSearchable { 356 if !d.isDir() { 357 return nil, linuxerr.ENOTDIR 358 } 359 if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { 360 return nil, err 361 } 362 } 363 vfsd := d.VFSDentry() 364 vfsd.IncRef() // Ownership transferred to caller. 365 return vfsd, nil 366 } 367 368 // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. 369 func (fs *Filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { 370 fs.mu.RLock() 371 defer fs.processDeferredDecRefs(ctx) 372 defer fs.mu.RUnlock() 373 d, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 374 if err != nil { 375 return nil, err 376 } 377 d.IncRef() // Ownership transferred to caller. 378 return d.VFSDentry(), nil 379 } 380 381 // LinkAt implements vfs.FilesystemImpl.LinkAt. 382 func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { 383 if rp.Done() { 384 return linuxerr.EEXIST 385 } 386 fs.mu.Lock() 387 defer fs.processDeferredDecRefs(ctx) 388 defer fs.mu.Unlock() 389 parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 390 if err != nil { 391 return err 392 } 393 394 if rp.Mount() != vd.Mount() { 395 return linuxerr.EXDEV 396 } 397 inode := vd.Dentry().Impl().(*Dentry).Inode() 398 if inode.Mode().IsDir() { 399 return linuxerr.EPERM 400 } 401 if err := vfs.MayLink(rp.Credentials(), inode.Mode(), inode.UID(), inode.GID()); err != nil { 402 return err 403 } 404 parent.dirMu.Lock() 405 defer parent.dirMu.Unlock() 406 pc := rp.Component() 407 if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { 408 return err 409 } 410 if rp.MustBeDir() { 411 return linuxerr.ENOENT 412 } 413 if err := rp.Mount().CheckBeginWrite(); err != nil { 414 return err 415 } 416 defer rp.Mount().EndWrite() 417 418 childI, err := parent.inode.NewLink(ctx, pc, inode) 419 if err != nil { 420 return err 421 } 422 parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */) 423 inode.Watches().Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) 424 var child Dentry 425 child.Init(fs, childI) 426 parent.insertChildLocked(pc, &child) 427 return nil 428 } 429 430 // MkdirAt implements vfs.FilesystemImpl.MkdirAt. 431 func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { 432 if rp.Done() { 433 return linuxerr.EEXIST 434 } 435 fs.mu.Lock() 436 defer fs.processDeferredDecRefs(ctx) 437 defer fs.mu.Unlock() 438 parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 439 if err != nil { 440 return err 441 } 442 443 parent.dirMu.Lock() 444 defer parent.dirMu.Unlock() 445 pc := rp.Component() 446 if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { 447 return err 448 } 449 if err := rp.Mount().CheckBeginWrite(); err != nil { 450 return err 451 } 452 defer rp.Mount().EndWrite() 453 childI, err := parent.inode.NewDir(ctx, pc, opts) 454 if err != nil { 455 if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) { 456 return err 457 } 458 childI = newSyntheticDirectory(ctx, rp.Credentials(), opts.Mode) 459 } 460 var child Dentry 461 child.Init(fs, childI) 462 parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE|linux.IN_ISDIR, 0, vfs.InodeEvent, false /* unlinked */) 463 parent.insertChildLocked(pc, &child) 464 return nil 465 } 466 467 // MknodAt implements vfs.FilesystemImpl.MknodAt. 468 func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { 469 if rp.Done() { 470 return linuxerr.EEXIST 471 } 472 fs.mu.Lock() 473 defer fs.processDeferredDecRefs(ctx) 474 defer fs.mu.Unlock() 475 parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 476 if err != nil { 477 return err 478 } 479 480 parent.dirMu.Lock() 481 defer parent.dirMu.Unlock() 482 pc := rp.Component() 483 if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { 484 return err 485 } 486 if rp.MustBeDir() { 487 return linuxerr.ENOENT 488 } 489 if err := rp.Mount().CheckBeginWrite(); err != nil { 490 return err 491 } 492 defer rp.Mount().EndWrite() 493 newI, err := parent.inode.NewNode(ctx, pc, opts) 494 if err != nil { 495 return err 496 } 497 parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */) 498 var newD Dentry 499 newD.Init(fs, newI) 500 parent.insertChildLocked(pc, &newD) 501 return nil 502 } 503 504 // OpenAt implements vfs.FilesystemImpl.OpenAt. 505 func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 506 ats := vfs.AccessTypesForOpenFlags(&opts) 507 508 // Do not create new file. 509 if opts.Flags&linux.O_CREAT == 0 { 510 fs.mu.RLock() 511 defer fs.processDeferredDecRefs(ctx) 512 d, err := fs.walkExistingLocked(ctx, rp) 513 if err != nil { 514 fs.mu.RUnlock() 515 return nil, err 516 } 517 if err := d.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { 518 fs.mu.RUnlock() 519 return nil, err 520 } 521 // Open may block so we need to unlock fs.mu. IncRef d to prevent 522 // its destruction while fs.mu is unlocked. 523 d.IncRef() 524 fs.mu.RUnlock() 525 fd, err := d.inode.Open(ctx, rp, d, opts) 526 d.DecRef(ctx) 527 return fd, err 528 } 529 530 // May create new file. 531 mustCreate := opts.Flags&linux.O_EXCL != 0 532 start := rp.Start().Impl().(*Dentry) 533 fs.mu.Lock() 534 unlocked := false 535 unlock := func() { 536 if !unlocked { 537 fs.mu.Unlock() 538 unlocked = true 539 } 540 } 541 // Process all to-be-decref'd dentries at the end at once. 542 // Since we defer unlock() AFTER this, fs.mu is guaranteed to be unlocked 543 // when this is executed. 544 defer fs.processDeferredDecRefs(ctx) 545 defer unlock() 546 if rp.Done() { 547 if rp.MustBeDir() { 548 return nil, linuxerr.EISDIR 549 } 550 if mustCreate { 551 return nil, linuxerr.EEXIST 552 } 553 if err := start.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { 554 return nil, err 555 } 556 // Open may block so we need to unlock fs.mu. IncRef d to prevent 557 // its destruction while fs.mu is unlocked. 558 start.IncRef() 559 unlock() 560 fd, err := start.inode.Open(ctx, rp, start, opts) 561 start.DecRef(ctx) 562 return fd, err 563 } 564 afterTrailingSymlink: 565 parent, err := fs.walkParentDirLocked(ctx, rp, start) 566 if err != nil { 567 return nil, err 568 } 569 // Check for search permission in the parent directory. 570 if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayExec); err != nil { 571 return nil, err 572 } 573 // Reject attempts to open directories with O_CREAT. 574 if rp.MustBeDir() { 575 return nil, linuxerr.EISDIR 576 } 577 pc := rp.Component() 578 if pc == "." || pc == ".." { 579 return nil, linuxerr.EISDIR 580 } 581 if len(pc) > linux.NAME_MAX { 582 return nil, linuxerr.ENAMETOOLONG 583 } 584 if parent.VFSDentry().IsDead() { 585 return nil, linuxerr.ENOENT 586 } 587 // Determine whether or not we need to create a file. 588 child, followedSymlink, err := fs.stepExistingLocked(ctx, rp, parent) 589 if followedSymlink { 590 if mustCreate { 591 // EEXIST must be returned if an existing symlink is opened with O_EXCL. 592 return nil, linuxerr.EEXIST 593 } 594 if err != nil { 595 // If followedSymlink && err != nil, then this symlink resolution error 596 // must be handled by the VFS layer. 597 return nil, err 598 } 599 start = parent 600 goto afterTrailingSymlink 601 } 602 if linuxerr.Equals(linuxerr.ENOENT, err) { 603 // Already checked for searchability above; now check for writability. 604 if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { 605 return nil, err 606 } 607 if err := rp.Mount().CheckBeginWrite(); err != nil { 608 return nil, err 609 } 610 defer rp.Mount().EndWrite() 611 // Create and open the child. 612 childI, err := parent.inode.NewFile(ctx, pc, opts) 613 if err != nil { 614 return nil, err 615 } 616 var child Dentry 617 child.Init(fs, childI) 618 parent.insertChild(pc, &child) 619 // Open may block so we need to unlock fs.mu. IncRef child to prevent 620 // its destruction while fs.mu is unlocked. 621 child.IncRef() 622 unlock() 623 parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) 624 fd, err := child.inode.Open(ctx, rp, &child, opts) 625 child.DecRef(ctx) 626 return fd, err 627 } 628 if err != nil { 629 return nil, err 630 } 631 // Open existing file or follow symlink. 632 if mustCreate { 633 return nil, linuxerr.EEXIST 634 } 635 if rp.MustBeDir() && !child.isDir() { 636 return nil, linuxerr.ENOTDIR 637 } 638 if err := child.inode.CheckPermissions(ctx, rp.Credentials(), ats); err != nil { 639 return nil, err 640 } 641 if child.isDir() { 642 // Can't open directories with O_CREAT. 643 if opts.Flags&linux.O_CREAT != 0 { 644 return nil, linuxerr.EISDIR 645 } 646 // Can't open directories writably. 647 if ats&vfs.MayWrite != 0 { 648 return nil, linuxerr.EISDIR 649 } 650 if opts.Flags&linux.O_DIRECT != 0 { 651 return nil, linuxerr.EINVAL 652 } 653 } 654 // Open may block so we need to unlock fs.mu. IncRef child to prevent 655 // its destruction while fs.mu is unlocked. 656 child.IncRef() 657 unlock() 658 fd, err := child.inode.Open(ctx, rp, child, opts) 659 child.DecRef(ctx) 660 return fd, err 661 } 662 663 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. 664 func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { 665 defer fs.processDeferredDecRefs(ctx) 666 667 fs.mu.RLock() 668 d, err := fs.walkExistingLocked(ctx, rp) 669 if err != nil { 670 fs.mu.RUnlock() 671 return "", err 672 } 673 if !d.isSymlink() { 674 fs.mu.RUnlock() 675 return "", linuxerr.EINVAL 676 } 677 678 // Inode.Readlink() cannot be called holding fs locks. 679 d.IncRef() 680 defer d.DecRef(ctx) 681 fs.mu.RUnlock() 682 683 return d.inode.Readlink(ctx, rp.Mount()) 684 } 685 686 // RenameAt implements vfs.FilesystemImpl.RenameAt. 687 func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { 688 fs.mu.Lock() 689 defer fs.processDeferredDecRefs(ctx) 690 defer fs.mu.Unlock() 691 692 // Resolve the destination directory first to verify that it's on this 693 // Mount. 694 dstDir, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 695 if err != nil { 696 return err 697 } 698 699 // Only RENAME_NOREPLACE is supported. 700 if opts.Flags&^linux.RENAME_NOREPLACE != 0 { 701 return linuxerr.EINVAL 702 } 703 noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0 704 705 mnt := rp.Mount() 706 if mnt != oldParentVD.Mount() { 707 return linuxerr.EXDEV 708 } 709 if err := mnt.CheckBeginWrite(); err != nil { 710 return err 711 } 712 defer mnt.EndWrite() 713 oldParentDir := oldParentVD.Dentry().Impl().(*Dentry).Inode() 714 if err := oldParentDir.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 715 return err 716 } 717 if err := dstDir.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 718 return err 719 } 720 721 srcDirVFSD := oldParentVD.Dentry() 722 srcDir := srcDirVFSD.Impl().(*Dentry) 723 src, err := fs.revalidateChildLocked(ctx, rp.VirtualFilesystem(), srcDir, oldName) 724 if err != nil { 725 return err 726 } 727 728 // Can we remove the src dentry? 729 if err := checkDeleteLocked(ctx, rp, src); err != nil { 730 return err 731 } 732 733 // Can we create the dst dentry? 734 var dst *Dentry 735 newName := rp.Component() 736 if newName == "." || newName == ".." { 737 if noReplace { 738 return linuxerr.EEXIST 739 } 740 return linuxerr.EBUSY 741 } 742 if len(newName) > linux.NAME_MAX { 743 return linuxerr.ENAMETOOLONG 744 } 745 746 err = checkCreateLocked(ctx, rp.Credentials(), newName, dstDir) 747 switch { 748 case err == nil: 749 // Ok, continue with rename as replacement. 750 case linuxerr.Equals(linuxerr.EEXIST, err): 751 if noReplace { 752 // Won't overwrite existing node since RENAME_NOREPLACE was requested. 753 return linuxerr.EEXIST 754 } 755 dst = dstDir.children[newName] 756 if dst == nil { 757 panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", newName, dstDir)) 758 } 759 default: 760 return err 761 } 762 763 if srcDir == dstDir && oldName == newName { 764 return nil 765 } 766 767 var dstVFSD *vfs.Dentry 768 if dst != nil { 769 dstVFSD = dst.VFSDentry() 770 } 771 772 mntns := vfs.MountNamespaceFromContext(ctx) 773 defer mntns.DecRef(ctx) 774 virtfs := rp.VirtualFilesystem() 775 776 // We can't deadlock here due to lock ordering because we're protected from 777 // concurrent renames by fs.mu held for writing. 778 srcDir.dirMu.Lock() 779 defer srcDir.dirMu.Unlock() 780 if srcDir != dstDir { 781 dstDir.dirMu.Lock() 782 defer dstDir.dirMu.Unlock() 783 } 784 785 srcVFSD := src.VFSDentry() 786 if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil { 787 return err 788 } 789 err = srcDir.inode.Rename(ctx, src.name, newName, src.inode, dstDir.inode) 790 if err != nil { 791 virtfs.AbortRenameDentry(srcVFSD, dstVFSD) 792 return err 793 } 794 delete(srcDir.children, src.name) 795 if srcDir != dstDir { 796 fs.deferDecRef(srcDir) // child (src) drops ref on old parent. 797 dstDir.IncRef() // child (src) takes a ref on the new parent. 798 } 799 src.parent.Store(dstDir) 800 src.name = newName 801 if dstDir.children == nil { 802 dstDir.children = make(map[string]*Dentry) 803 } 804 replaced := dstDir.children[newName] 805 dstDir.children[newName] = src 806 var replaceVFSD *vfs.Dentry 807 if replaced != nil { 808 // deferDecRef so that fs.mu and dstDir.mu are unlocked by then. 809 fs.deferDecRef(replaced) 810 replaceVFSD = replaced.VFSDentry() 811 replaced.setDeleted() 812 } 813 vfs.InotifyRename(ctx, src.inode.Watches(), srcDir.inode.Watches(), dstDir.inode.Watches(), oldName, newName, src.isDir()) 814 for _, rc := range virtfs.CommitRenameReplaceDentry(ctx, srcVFSD, replaceVFSD) { // +checklocksforce: to may be nil, that's okay. 815 fs.deferDecRef(rc) 816 } 817 return nil 818 } 819 820 // RmdirAt implements vfs.FilesystemImpl.RmdirAt. 821 func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { 822 fs.mu.Lock() 823 defer fs.processDeferredDecRefs(ctx) 824 defer fs.mu.Unlock() 825 parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 826 if err != nil { 827 return err 828 } 829 if err := parent.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 830 return err 831 } 832 if err := rp.Mount().CheckBeginWrite(); err != nil { 833 return err 834 } 835 defer rp.Mount().EndWrite() 836 name := rp.Component() 837 if name == "." { 838 return linuxerr.EINVAL 839 } 840 if name == ".." { 841 return linuxerr.ENOTEMPTY 842 } 843 child, ok := parent.children[name] 844 if !ok { 845 return linuxerr.ENOENT 846 } 847 if err := checkDeleteLocked(ctx, rp, child); err != nil { 848 return err 849 } 850 if err := vfs.CheckDeleteSticky( 851 rp.Credentials(), 852 linux.FileMode(parent.inode.Mode()), 853 auth.KUID(parent.inode.UID()), 854 auth.KUID(child.inode.UID()), 855 auth.KGID(child.inode.GID()), 856 ); err != nil { 857 return err 858 } 859 if !child.isDir() { 860 return linuxerr.ENOTDIR 861 } 862 if child.inode.HasChildren() { 863 return linuxerr.ENOTEMPTY 864 } 865 virtfs := rp.VirtualFilesystem() 866 parent.dirMu.Lock() 867 defer parent.dirMu.Unlock() 868 869 mntns := vfs.MountNamespaceFromContext(ctx) 870 defer mntns.DecRef(ctx) 871 vfsd := child.VFSDentry() 872 if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { 873 return err // +checklocksforce: vfsd is not locked. 874 } 875 876 if err := parent.inode.RmDir(ctx, child.name, child.inode); err != nil { 877 virtfs.AbortDeleteDentry(vfsd) 878 return err 879 } 880 delete(parent.children, child.name) 881 parent.inode.Watches().Notify(ctx, child.name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) 882 // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. 883 fs.deferDecRef(child) 884 rcs := virtfs.CommitDeleteDentry(ctx, vfsd) 885 for _, rc := range rcs { 886 fs.deferDecRef(rc) 887 } 888 child.setDeleted() 889 return nil 890 } 891 892 // SetStatAt implements vfs.FilesystemImpl.SetStatAt. 893 func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { 894 fs.mu.RLock() 895 defer fs.processDeferredDecRefs(ctx) 896 d, err := fs.walkExistingLocked(ctx, rp) 897 if err != nil { 898 fs.mu.RUnlock() 899 return err 900 } 901 if opts.Stat.Mask == 0 { 902 fs.mu.RUnlock() 903 return nil 904 } 905 err = d.inode.SetStat(ctx, fs.VFSFilesystem(), rp.Credentials(), opts) 906 fs.mu.RUnlock() 907 if err != nil { 908 return err 909 } 910 if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { 911 d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) 912 } 913 return nil 914 } 915 916 // StatAt implements vfs.FilesystemImpl.StatAt. 917 func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { 918 fs.mu.RLock() 919 defer fs.processDeferredDecRefs(ctx) 920 defer fs.mu.RUnlock() 921 d, err := fs.walkExistingLocked(ctx, rp) 922 if err != nil { 923 return linux.Statx{}, err 924 } 925 return d.inode.Stat(ctx, fs.VFSFilesystem(), opts) 926 } 927 928 // StatFSAt implements vfs.FilesystemImpl.StatFSAt. 929 func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { 930 fs.mu.RLock() 931 defer fs.processDeferredDecRefs(ctx) 932 defer fs.mu.RUnlock() 933 d, err := fs.walkExistingLocked(ctx, rp) 934 if err != nil { 935 return linux.Statfs{}, err 936 } 937 return d.inode.StatFS(ctx, fs.VFSFilesystem()) 938 } 939 940 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 941 func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { 942 if rp.Done() { 943 return linuxerr.EEXIST 944 } 945 fs.mu.Lock() 946 defer fs.processDeferredDecRefs(ctx) 947 defer fs.mu.Unlock() 948 parent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*Dentry)) 949 if err != nil { 950 return err 951 } 952 parent.dirMu.Lock() 953 defer parent.dirMu.Unlock() 954 955 pc := rp.Component() 956 if err := checkCreateLocked(ctx, rp.Credentials(), pc, parent); err != nil { 957 return err 958 } 959 if rp.MustBeDir() { 960 return linuxerr.ENOENT 961 } 962 if err := rp.Mount().CheckBeginWrite(); err != nil { 963 return err 964 } 965 defer rp.Mount().EndWrite() 966 childI, err := parent.inode.NewSymlink(ctx, pc, target) 967 if err != nil { 968 return err 969 } 970 parent.inode.Watches().Notify(ctx, pc, linux.IN_CREATE, 0, vfs.InodeEvent, false /* unlinked */) 971 var child Dentry 972 child.Init(fs, childI) 973 parent.insertChildLocked(pc, &child) 974 return nil 975 } 976 977 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 978 func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { 979 fs.mu.Lock() 980 defer fs.processDeferredDecRefs(ctx) 981 defer fs.mu.Unlock() 982 983 d, err := fs.walkExistingLocked(ctx, rp) 984 if err != nil { 985 return err 986 } 987 if err := rp.Mount().CheckBeginWrite(); err != nil { 988 return err 989 } 990 defer rp.Mount().EndWrite() 991 if err := checkDeleteLocked(ctx, rp, d); err != nil { 992 return err 993 } 994 if d.isDir() { 995 return linuxerr.EISDIR 996 } 997 virtfs := rp.VirtualFilesystem() 998 parentDentry := d.parent.Load() 999 parentDentry.dirMu.Lock() 1000 defer parentDentry.dirMu.Unlock() 1001 mntns := vfs.MountNamespaceFromContext(ctx) 1002 defer mntns.DecRef(ctx) 1003 vfsd := d.VFSDentry() 1004 if err := virtfs.PrepareDeleteDentry(mntns, vfsd); err != nil { 1005 return err 1006 } 1007 if err := parentDentry.inode.Unlink(ctx, d.name, d.inode); err != nil { 1008 virtfs.AbortDeleteDentry(vfsd) 1009 return err 1010 } 1011 delete(parentDentry.children, d.name) 1012 vfs.InotifyRemoveChild(ctx, d.inode.Watches(), parentDentry.inode.Watches(), d.name) 1013 // Defer decref so that fs.mu and parentDentry.dirMu are unlocked by then. 1014 fs.deferDecRef(d) 1015 rcs := virtfs.CommitDeleteDentry(ctx, vfsd) 1016 for _, rc := range rcs { 1017 fs.deferDecRef(rc) 1018 } 1019 d.setDeleted() 1020 return nil 1021 } 1022 1023 // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. 1024 func (fs *Filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { 1025 fs.mu.RLock() 1026 defer fs.processDeferredDecRefs(ctx) 1027 defer fs.mu.RUnlock() 1028 d, err := fs.walkExistingLocked(ctx, rp) 1029 if err != nil { 1030 return nil, err 1031 } 1032 if err := d.inode.CheckPermissions(ctx, rp.Credentials(), vfs.MayWrite); err != nil { 1033 return nil, err 1034 } 1035 return nil, linuxerr.ECONNREFUSED 1036 } 1037 1038 // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. 1039 func (fs *Filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { 1040 fs.mu.RLock() 1041 defer fs.processDeferredDecRefs(ctx) 1042 defer fs.mu.RUnlock() 1043 _, err := fs.walkExistingLocked(ctx, rp) 1044 if err != nil { 1045 return nil, err 1046 } 1047 // kernfs currently does not support extended attributes. 1048 return nil, linuxerr.ENOTSUP 1049 } 1050 1051 // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. 1052 func (fs *Filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { 1053 fs.mu.RLock() 1054 defer fs.processDeferredDecRefs(ctx) 1055 defer fs.mu.RUnlock() 1056 _, err := fs.walkExistingLocked(ctx, rp) 1057 if err != nil { 1058 return "", err 1059 } 1060 // kernfs currently does not support extended attributes. 1061 return "", linuxerr.ENOTSUP 1062 } 1063 1064 // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 1065 func (fs *Filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { 1066 fs.mu.RLock() 1067 defer fs.processDeferredDecRefs(ctx) 1068 defer fs.mu.RUnlock() 1069 _, err := fs.walkExistingLocked(ctx, rp) 1070 if err != nil { 1071 return err 1072 } 1073 // kernfs currently does not support extended attributes. 1074 return linuxerr.ENOTSUP 1075 } 1076 1077 // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. 1078 func (fs *Filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { 1079 fs.mu.RLock() 1080 defer fs.processDeferredDecRefs(ctx) 1081 defer fs.mu.RUnlock() 1082 _, err := fs.walkExistingLocked(ctx, rp) 1083 if err != nil { 1084 return err 1085 } 1086 // kernfs currently does not support extended attributes. 1087 return linuxerr.ENOTSUP 1088 } 1089 1090 // PrependPath implements vfs.FilesystemImpl.PrependPath. 1091 func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 1092 fs.mu.RLock() 1093 defer fs.mu.RUnlock() 1094 return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*Dentry), b) 1095 } 1096 1097 func (fs *Filesystem) deferDecRefVD(ctx context.Context, vd vfs.VirtualDentry) { 1098 if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { 1099 // The following is equivalent to vd.DecRef(ctx). This is needed 1100 // because if d belongs to this filesystem, we can not DecRef it right 1101 // away as we may be holding fs.mu. d.DecRef may acquire fs.mu. So we 1102 // defer the DecRef to when locks are dropped. 1103 vd.Mount().DecRef(ctx) 1104 fs.deferDecRef(d) 1105 } else { 1106 vd.DecRef(ctx) 1107 } 1108 } 1109 1110 // IsDescendant implements vfs.FilesystemImpl.IsDescendant. 1111 func (fs *Filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { 1112 return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*Dentry)) 1113 }