github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/kernfs/kernfs.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package kernfs provides the tools to implement inode-based filesystems. 16 // Kernfs has two main features: 17 // 18 // 1. The Inode interface, which maps VFS's path-based filesystem operations to 19 // specific filesystem nodes. Kernfs uses the Inode interface to provide a 20 // blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as 21 // the synchronization mechanism for all filesystem operations by holding a 22 // filesystem-wide lock across all operations. 23 // 24 // 2. Various utility types which provide generic implementations for various 25 // parts of the Inode and vfs.FileDescription interfaces. Client filesystems 26 // based on kernfs can embed the appropriate set of these to avoid having to 27 // reimplement common filesystem operations. See inode_impl_util.go and 28 // fd_impl_util.go. 29 // 30 // Reference Model: 31 // 32 // Kernfs dentries represent named pointers to inodes. Kernfs is solely 33 // responsible for maintaining and modifying its dentry tree; inode 34 // implementations cannot access the tree. Dentries and inodes have 35 // independent lifetimes and reference counts. A child dentry unconditionally 36 // holds a reference on its parent directory's dentry. 
// A dentry also holds a reference on the inode it points to (although that
// might not be the only reference on the inode). Due to this inodes can
// outlive the dentries that point to them. Multiple dentries can point to the
// same inode (for example, in the case of hardlinks). File descriptors hold a
// reference to the dentry they're opened on.
//
// Dentries are guaranteed to exist while holding Filesystem.mu for
// reading. Dropping dentries requires holding Filesystem.mu for writing. To
// queue dentries for destruction from a read critical section, see
// Filesystem.deferDecRef.
//
// Lock ordering:
//
//	kernfs.Filesystem.mu
//	kernel.TaskSet.mu
//	kernel.Task.mu
//	kernfs.Dentry.dirMu
//	vfs.VirtualFilesystem.mountMu
//	vfs.Dentry.mu
//	(inode implementation locks, if any)
//
//	kernfs.Filesystem.deferredDecRefsMu
package kernfs

import (
	"fmt"
	"sync/atomic"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/fspath"
	"github.com/metacubex/gvisor/pkg/refs"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/sync"
)

// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
//
// +stateify savable
type Filesystem struct {
	// vfsfs is the generic vfs filesystem object; VFSFilesystem returns a
	// pointer to it.
	vfsfs vfs.Filesystem

	// deferredDecRefsMu protects deferredDecRefs.
	deferredDecRefsMu deferredDecRefsMutex `state:"nosave"`

	// deferredDecRefs is a list of dentries waiting to be DecRef()ed. This is
	// used to defer dentry destruction until mu can be acquired for
	// writing. Protected by deferredDecRefsMu.
	deferredDecRefs []refs.RefCounter

	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
	// for reading guarantees continued existence of any resolved dentries, but
	// the dentry tree may be modified.
	//
	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
	// example:
	//
	//	fs.mu.Lock()
	//	defer fs.mu.Unlock()
	//	...
	//	dentry1.DecRef()
	//	defer dentry2.DecRef() // Ok, will run before Unlock.
	//
	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
	// example:
	//
	//	fs.mu.RLock()
	//	defer fs.processDeferredDecRefs()
	//	defer fs.mu.RUnlock()
	//	...
	//	fs.deferDecRef(dentry)
	mu filesystemRWMutex `state:"nosave"`

	// nextInoMinusOne is used to allocate inode numbers on this
	// filesystem. Must be accessed by atomic operations.
	nextInoMinusOne atomicbitops.Uint64

	// cachedDentries contains all dentries with 0 references. (Due to race
	// conditions, it may also contain dentries with non-zero references.)
	// cachedDentriesLen is the number of dentries in cachedDentries. These
	// fields are protected by mu.
	cachedDentries    dentryList
	cachedDentriesLen uint64

	// MaxCachedDentries is the maximum size of cachedDentries. If not set,
	// defaults to 0 and kernfs does not cache any dentries. This is immutable.
	MaxCachedDentries uint64

	// root is the root dentry of this filesystem. Note that root may be nil for
	// filesystems on a disconnected mount without a root (e.g. pipefs, sockfs,
	// hostfs). Filesystem holds an extra reference on root to prevent it from
	// being destroyed prematurely. This is immutable.
	root *Dentry
}

// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
// This may be called while Filesystem.mu or Dentry.dirMu is locked.
139 func (fs *Filesystem) deferDecRef(d refs.RefCounter) { 140 fs.deferredDecRefsMu.Lock() 141 fs.deferredDecRefs = append(fs.deferredDecRefs, d) 142 fs.deferredDecRefsMu.Unlock() 143 } 144 145 // SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred 146 // in case Filesystem.mu is held. See comment on Filesystem.mu. 147 func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) { 148 if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs { 149 // Only defer if dentry belongs to this filesystem, since locks cannot cross 150 // filesystems. 151 fs.deferDecRef(fd) 152 return 153 } 154 fd.DecRef(ctx) 155 } 156 157 // SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred 158 // in case Filesystem.mu is held. See comment on Filesystem.mu. 159 func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) { 160 if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs { 161 // Only defer if dentry belongs to this filesystem, since locks cannot cross 162 // filesystems. 163 fs.deferDecRef(&vd) 164 return 165 } 166 vd.DecRef(ctx) 167 } 168 169 // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the 170 // deferredDecRefs list. See comment on Filesystem.mu. 171 // 172 // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked. 173 func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) { 174 fs.deferredDecRefsMu.Lock() 175 for _, d := range fs.deferredDecRefs { 176 // Defer the DecRef call so that we are not holding deferredDecRefsMu 177 // when DecRef is called. 178 defer d.DecRef(ctx) 179 } 180 fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse. 181 fs.deferredDecRefsMu.Unlock() 182 } 183 184 // VFSFilesystem returns the generic vfs filesystem object. 185 func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { 186 return &fs.vfsfs 187 } 188 189 // NextIno allocates a new inode number on this filesystem. 
190 func (fs *Filesystem) NextIno() uint64 { 191 return fs.nextInoMinusOne.Add(1) 192 } 193 194 // These consts are used in the Dentry.flags field. 195 const ( 196 // Dentry points to a directory inode. 197 dflagsIsDir = 1 << iota 198 199 // Dentry points to a symlink inode. 200 dflagsIsSymlink 201 ) 202 203 // Dentry implements vfs.DentryImpl. 204 // 205 // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a 206 // named reference to an inode. A dentry generally lives as long as it's part of 207 // a mounted filesystem tree. Kernfs drops dentries once all references to them 208 // are dropped. Dentries hold a single reference to the inode they point 209 // to, and child dentries hold a reference on their parent. 210 // 211 // Must be initialized by Init prior to first use. 212 // 213 // +stateify savable 214 type Dentry struct { 215 vfsd vfs.Dentry 216 217 // refs is the reference count. When refs reaches 0, the dentry may be 218 // added to the cache or destroyed. If refs == -1, the dentry has already 219 // been destroyed. refs are allowed to go to 0 and increase again. refs is 220 // accessed using atomic memory operations. 221 refs atomicbitops.Int64 222 223 // fs is the owning filesystem. fs is immutable. 224 fs *Filesystem 225 226 // flags caches useful information about the dentry from the inode. See the 227 // dflags* consts above. 228 flags atomicbitops.Uint32 229 230 parent atomic.Pointer[Dentry] `state:".(*Dentry)"` 231 232 name string 233 234 // If cached is true, dentryEntry links dentry into 235 // Filesystem.cachedDentries. cached and dentryEntry are protected by 236 // Filesystem.mu. 237 cached bool 238 dentryEntry 239 240 // dirMu protects children and the names of child Dentries. 241 // 242 // Note that holding fs.mu for writing is not sufficient; 243 // revalidateChildLocked(), which is a very hot path, may modify children with 244 // fs.mu acquired for reading only. 
245 dirMu sync.Mutex `state:"nosave"` 246 children map[string]*Dentry 247 248 inode Inode 249 250 // If deleted is non-zero, the file represented by this dentry has been 251 // deleted. deleted is accessed using atomic memory operations. 252 deleted atomicbitops.Uint32 253 } 254 255 // IncRef implements vfs.DentryImpl.IncRef. 256 func (d *Dentry) IncRef() { 257 // d.refs may be 0 if d.fs.mu is locked, which serializes against 258 // d.cacheLocked(). 259 r := d.refs.Add(1) 260 if d.LogRefs() { 261 refs.LogIncRef(d, r) 262 } 263 } 264 265 // TryIncRef implements vfs.DentryImpl.TryIncRef. 266 func (d *Dentry) TryIncRef() bool { 267 for { 268 r := d.refs.Load() 269 if r <= 0 { 270 return false 271 } 272 if d.refs.CompareAndSwap(r, r+1) { 273 if d.LogRefs() { 274 refs.LogTryIncRef(d, r+1) 275 } 276 return true 277 } 278 } 279 } 280 281 // DecRef implements vfs.DentryImpl.DecRef. 282 func (d *Dentry) DecRef(ctx context.Context) { 283 r := d.refs.Add(-1) 284 if d.LogRefs() { 285 refs.LogDecRef(d, r) 286 } 287 if r == 0 { 288 if d.inode.Anonymous() { 289 // Nothing to cache. Skip right to destroy. This avoids 290 // taking fs.mu in the DecRef() path for anonymous 291 // inodes. 292 d.destroy(ctx) 293 return 294 } 295 296 d.fs.mu.Lock() 297 defer d.fs.mu.Unlock() 298 d.cacheLocked(ctx) 299 } else if r < 0 { 300 panic("kernfs.Dentry.DecRef() called without holding a reference") 301 } 302 } 303 304 func (d *Dentry) decRefLocked(ctx context.Context) { 305 r := d.refs.Add(-1) 306 if d.LogRefs() { 307 refs.LogDecRef(d, r) 308 } 309 if r == 0 { 310 d.cacheLocked(ctx) 311 } else if r < 0 { 312 panic("kernfs.Dentry.DecRef() called without holding a reference") 313 } 314 } 315 316 // cacheLocked should be called after d's reference count becomes 0. The ref 317 // count check may happen before acquiring d.fs.mu so there might be a race 318 // condition where the ref count is increased again by the time the caller 319 // acquires d.fs.mu. This race is handled. 
// Only reachable dentries are added to the cache. However, a dentry might
// become unreachable *while* it is in the cache due to invalidation.
//
// Preconditions: d.fs.mu must be locked for writing.
func (d *Dentry) cacheLocked(ctx context.Context) {
	// Dentries with a non-zero reference count must be retained. (The only way
	// to obtain a reference on a dentry with zero references is via path
	// resolution, which requires d.fs.mu, so if d.refs is zero then it will
	// remain zero while we hold d.fs.mu for writing.)
	refs := d.refs.Load()
	if refs == -1 {
		// Dentry has already been destroyed.
		return
	}
	if refs > 0 {
		// The dentry is in use again; make sure it is not left on the cache
		// list.
		if d.cached {
			d.fs.cachedDentries.Remove(d)
			d.fs.cachedDentriesLen--
			d.cached = false
		}
		return
	}
	// If the dentry is deleted and invalidated or has no parent, then it is no
	// longer reachable by path resolution and should be dropped immediately
	// because it has zero references.
	// Note that a dentry may not always have a parent; for example magic links
	// as described in Inode.Getlink.
	if isDead, parent := d.VFSDentry().IsDead(), d.parent.Load(); isDead || parent == nil {
		if !isDead {
			// Invalidate the dentry and queue whatever InvalidateDentry
			// returns for a deferred DecRef.
			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
			for _, rc := range rcs {
				d.fs.deferDecRef(rc)
			}
		}
		if d.cached {
			d.fs.cachedDentries.Remove(d)
			d.fs.cachedDentriesLen--
			d.cached = false
		}
		if d.isDeleted() {
			d.inode.Watches().HandleDeletion(ctx)
		}
		d.destroy(ctx)
		// Drop the reference this dentry held on its parent.
		if parent != nil {
			parent.decRefLocked(ctx)
		}
		return
	}
	if d.VFSDentry().IsEvictable() {
		d.evictLocked(ctx)
		return
	}
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.cachedDentries.Remove(d)
		d.fs.cachedDentries.PushFront(d)
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.cachedDentries.PushFront(d)
	d.fs.cachedDentriesLen++
	d.cached = true
	if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries {
		return
	}
	d.fs.evictCachedDentryLocked(ctx)
	// Whether or not the evicted dentry was destroyed, we brought
	// fs.cachedDentriesLen back down to fs.MaxCachedDentries, so we don't
	// loop.
}

// evictCachedDentryLocked evicts the dentry at the back of
// fs.cachedDentries.
//
// Preconditions:
//   - fs.mu must be locked for writing.
func (fs *Filesystem) evictCachedDentryLocked(ctx context.Context) {
	// Evict the least recently used dentry because cache size is greater than
	// max cache size (configured on mount).
	fs.cachedDentries.Back().evictLocked(ctx)
}

// evictLocked removes d from the dentry cache and, if d still has zero
// references, detaches it from its parent's children map, destroys it and
// drops the reference it held on its parent. It is a no-op when d is nil.
//
// Preconditions:
//   - d.fs.mu must be locked for writing.
func (d *Dentry) evictLocked(ctx context.Context) {
	if d == nil {
		return
	}
	if d.cached {
		d.fs.cachedDentries.Remove(d)
		d.fs.cachedDentriesLen--
		d.cached = false
	}
	// d.refs may have become non-zero from an earlier path resolution
	// after it was inserted into fs.cachedDentries.
	if d.refs.Load() == 0 {
		if !d.vfsd.IsDead() {
			parent := d.parent.Load()
			parent.dirMu.Lock()
			// Note that d can't be a mount point (in any mount
			// namespace), since VFS holds references on mount points.
			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
			for _, rc := range rcs {
				d.fs.deferDecRef(rc)
			}
			delete(parent.children, d.name)
			parent.dirMu.Unlock()
		}
		d.destroy(ctx)
		if parent := d.parent.Load(); parent != nil {
			parent.decRefLocked(ctx)
		}
	}
}

// destroy destroys the dentry.
//
// Preconditions:
//   - d.refs == 0.
//   - d should have been removed from d.parent.children, i.e.
d is not reachable 437 // by path traversal. 438 // - d.vfsd.IsDead() is true. 439 func (d *Dentry) destroy(ctx context.Context) { 440 switch refs := d.refs.Load(); refs { 441 case 0: 442 // Mark the dentry destroyed. 443 d.refs.Store(-1) 444 case -1: 445 panic("dentry.destroy() called on already destroyed dentry") 446 default: 447 panic("dentry.destroy() called with references on the dentry") 448 } 449 450 d.inode.DecRef(ctx) // IncRef from Init. 451 452 refs.Unregister(d) 453 } 454 455 // RefType implements refs.CheckedObject.Type. 456 func (d *Dentry) RefType() string { 457 return "kernfs.Dentry" 458 } 459 460 // LeakMessage implements refs.CheckedObject.LeakMessage. 461 func (d *Dentry) LeakMessage() string { 462 return fmt.Sprintf("[kernfs.Dentry %p] reference count of %d instead of -1", d, d.refs.Load()) 463 } 464 465 // LogRefs implements refs.CheckedObject.LogRefs. 466 // 467 // This should only be set to true for debugging purposes, as it can generate an 468 // extremely large amount of output and drastically degrade performance. 469 func (d *Dentry) LogRefs() bool { 470 return false 471 } 472 473 // InitRoot initializes this dentry as the root of the filesystem. 474 // 475 // Precondition: Caller must hold a reference on inode. 476 // 477 // Postcondition: Caller's reference on inode is transferred to the dentry. 478 func (d *Dentry) InitRoot(fs *Filesystem, inode Inode) { 479 d.Init(fs, inode) 480 fs.root = d 481 // Hold an extra reference on the root dentry. It is held by fs to prevent the 482 // root from being "cached" and subsequently evicted. 483 d.IncRef() 484 } 485 486 // Init initializes this dentry. 487 // 488 // Precondition: Caller must hold a reference on inode. 489 // 490 // Postcondition: Caller's reference on inode is transferred to the dentry. 
491 func (d *Dentry) Init(fs *Filesystem, inode Inode) { 492 d.vfsd.Init(d) 493 d.fs = fs 494 d.inode = inode 495 d.refs.Store(1) 496 ftype := inode.Mode().FileType() 497 if ftype == linux.ModeDirectory { 498 d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsDir) 499 } 500 if ftype == linux.ModeSymlink { 501 d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsSymlink) 502 } 503 refs.Register(d) 504 } 505 506 // VFSDentry returns the generic vfs dentry for this kernfs dentry. 507 func (d *Dentry) VFSDentry() *vfs.Dentry { 508 return &d.vfsd 509 } 510 511 func (d *Dentry) isDeleted() bool { 512 return d.deleted.Load() != 0 513 } 514 515 func (d *Dentry) setDeleted() { 516 d.deleted.Store(1) 517 } 518 519 // isDir checks whether the dentry points to a directory inode. 520 func (d *Dentry) isDir() bool { 521 return d.flags.Load()&dflagsIsDir != 0 522 } 523 524 // isSymlink checks whether the dentry points to a symlink inode. 525 func (d *Dentry) isSymlink() bool { 526 return d.flags.Load()&dflagsIsSymlink != 0 527 } 528 529 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 530 func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 531 if d.isDir() { 532 events |= linux.IN_ISDIR 533 } 534 535 // Linux always notifies the parent first. 536 537 // Don't bother looking for a parent if the inode is anonymous. It 538 // won't have one. 539 if !d.inode.Anonymous() { 540 d.fs.mu.RLock() 541 if parent := d.parent.Load(); parent != nil { 542 parent.inode.Watches().Notify(ctx, d.name, events, cookie, et, d.isDeleted()) 543 } 544 d.fs.mu.RUnlock() 545 } 546 547 d.inode.Watches().Notify(ctx, "", events, cookie, et, d.isDeleted()) 548 } 549 550 // Watches implements vfs.DentryImpl.Watches. 551 func (d *Dentry) Watches() *vfs.Watches { 552 return d.inode.Watches() 553 } 554 555 // OnZeroWatches implements vfs.Dentry.OnZeroWatches. 
556 func (d *Dentry) OnZeroWatches(context.Context) {} 557 558 // insertChild inserts child into the vfs dentry cache with the given name under 559 // this dentry. This does not update the directory inode, so calling this on its 560 // own isn't sufficient to insert a child into a directory. 561 // 562 // Preconditions: 563 // - d must represent a directory inode. 564 // - d.fs.mu must be locked for at least reading. 565 func (d *Dentry) insertChild(name string, child *Dentry) { 566 d.dirMu.Lock() 567 d.insertChildLocked(name, child) 568 d.dirMu.Unlock() 569 } 570 571 // insertChildLocked is equivalent to insertChild, with additional 572 // preconditions. 573 // 574 // Preconditions: 575 // - d must represent a directory inode. 576 // - d.dirMu must be locked. 577 // - d.fs.mu must be locked for at least reading. 578 func (d *Dentry) insertChildLocked(name string, child *Dentry) { 579 if !d.isDir() { 580 panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d)) 581 } 582 d.IncRef() // DecRef in child's Dentry.destroy. 583 child.parent.Store(d) 584 child.name = name 585 if d.children == nil { 586 d.children = make(map[string]*Dentry) 587 } 588 d.children[name] = child 589 } 590 591 // Inode returns the dentry's inode. 592 func (d *Dentry) Inode() Inode { 593 return d.inode 594 } 595 596 // FSLocalPath returns an absolute path to d, relative to the root of its 597 // filesystem. 598 func (d *Dentry) FSLocalPath() string { 599 var b fspath.Builder 600 _ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b) 601 b.PrependByte('/') 602 return b.String() 603 } 604 605 // WalkDentryTree traverses p in the dentry tree for this filesystem. Note that 606 // this only traverses the dentry tree and is not a general path traversal. No 607 // symlinks and dynamic children are resolved, and no permission checks are 608 // performed. The caller is responsible for ensuring the returned Dentry exists 609 // for an appropriate lifetime. 
610 // 611 // p is interpreted starting at d, and may be absolute or relative (absolute vs 612 // relative paths both refer to the same target here, since p is absolute from 613 // d). p may contain "." and "..", but will not allow traversal above d (similar 614 // to ".." at the root dentry). 615 // 616 // This is useful for filesystem internals, where the filesystem may not be 617 // mounted yet. For a mounted filesystem, use GetDentryAt. 618 func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) { 619 d.fs.mu.RLock() 620 defer d.fs.processDeferredDecRefs(ctx) 621 defer d.fs.mu.RUnlock() 622 623 target := d 624 625 for pit := p.Begin; pit.Ok(); pit = pit.Next() { 626 pc := pit.String() 627 628 switch { 629 case target == nil: 630 return nil, linuxerr.ENOENT 631 case pc == ".": 632 // No-op, consume component and continue. 633 case pc == "..": 634 if target == d { 635 // Don't let .. traverse above the start point of the walk. 636 continue 637 } 638 target = target.parent.Load() 639 // Parent doesn't need revalidation since we revalidated it on the 640 // way to the child, and we're still holding fs.mu. 641 default: 642 var err error 643 644 d.dirMu.Lock() 645 target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc]) 646 d.dirMu.Unlock() 647 648 if err != nil { 649 return nil, err 650 } 651 } 652 } 653 654 if target == nil { 655 return nil, linuxerr.ENOENT 656 } 657 658 target.IncRef() 659 return target, nil 660 } 661 662 // Parent returns the parent of this Dentry. This is not safe in general, the 663 // filesystem may concurrently move d elsewhere. The caller is responsible for 664 // ensuring the returned result remains valid while it is used. 665 func (d *Dentry) Parent() *Dentry { 666 return d.parent.Load() 667 } 668 669 // The Inode interface maps filesystem-level operations that operate on paths to 670 // equivalent operations on specific filesystem nodes. 
//
// The interface methods are grouped into logical categories as sub interfaces
// below. Generally, an implementation for each sub interface can be provided by
// embedding an appropriate type from inode_impl_util.go. The sub interfaces
// are purely organizational. Methods declared directly in the main interface
// have no generic implementations, and should be explicitly provided by the
// client filesystem.
//
// Generally, implementations are not responsible for tasks that are common to
// all filesystems. These include:
//
//   - Checking that dentries passed to methods are of the appropriate file type.
//   - Checking permissions.
//
// Inode functions may be called holding filesystem wide locks and are not
// allowed to call vfs functions that may reenter, unless otherwise noted.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
	// Methods related to reference counting. A generic implementation is
	// provided by InodeNoopRefCount. These methods are generally called by the
	// equivalent Dentry methods.
	inodeRefs

	// Methods related to node metadata. A generic implementation is provided by
	// InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
	// managing link counts.
	inodeMetadata

	// Method for inodes that represent symlink. InodeNotSymlink provides a
	// blanket implementation for all non-symlink inodes.
	inodeSymlink

	// Method for inodes that represent directories. InodeNotDirectory provides
	// a blanket implementation for all non-directory inodes.
	inodeDirectory

	// Open creates a file description for the filesystem object represented by
	// this inode. The returned file description should hold a reference on the
	// dentry for its lifetime.
	//
	// Precondition: rp.Done(). d must be the kernfs Dentry containing
	// the inode on which Open() is being called.
	Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)

	// StatFS returns filesystem statistics for the client filesystem. This
	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
	// doesn't support statfs(2), this should return ENOSYS.
	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)

	// Keep indicates whether the dentry created after Inode.Lookup should be
	// kept in the kernfs dentry tree.
	Keep() bool

	// Valid should return true if this inode is still valid, or needs to
	// be resolved again by a call to Lookup.
	Valid(ctx context.Context) bool

	// Watches returns the set of inotify watches associated with this inode.
	Watches() *vfs.Watches

	// Anonymous indicates that the Inode is anonymous. It will never have
	// a name or parent.
	Anonymous() bool
}

// inodeRefs groups the reference counting methods of Inode.
type inodeRefs interface {
	IncRef()
	DecRef(ctx context.Context)
	TryIncRef() bool
}

// inodeMetadata groups the metadata methods of Inode.
type inodeMetadata interface {
	// CheckPermissions checks that creds may access this inode for the
	// requested access type, per the rules of
	// fs/namei.c:generic_permission().
	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error

	// Mode returns the (struct stat)::st_mode value for this inode. This is
	// separated from Stat for performance.
	Mode() linux.FileMode

	// UID returns the (struct stat)::st_uid value for this inode. This is
	// separated from Stat for performance.
	UID() auth.KUID

	// GID returns the (struct stat)::st_gid value for this inode. This is
	// separated from Stat for performance.
	GID() auth.KGID

	// Stat returns the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.StatAt.
	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)

	// SetStat updates the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
	// if the operation can be performed (see vfs.CheckSetStat() for common
	// checks).
	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
}

// Precondition: All methods in this interface may only be called on directory
// inodes.
type inodeDirectory interface {
	// The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
	// that will be hashed into the dentry tree.
	//
	// These inode constructors are inode-level operations rather than
	// filesystem-level operations to allow client filesystems to mix different
	// implementations based on the new node's location in the
	// filesystem.

	// HasChildren returns true if the directory inode has any children.
	HasChildren() bool

	// NewFile creates a new regular file inode.
	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)

	// NewDir creates a new directory inode.
	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)

	// NewLink creates a new hardlink to a specified inode in this
	// directory. Implementations should create a new kernfs Dentry pointing to
	// target, and update target's link count.
	NewLink(ctx context.Context, name string, target Inode) (Inode, error)

	// NewSymlink creates a new symbolic link inode.
	NewSymlink(ctx context.Context, name, target string) (Inode, error)

	// NewNode creates a new filesystem node for a mknod syscall.
	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)

	// Unlink removes a child dentry from this directory inode.
	Unlink(ctx context.Context, name string, child Inode) error

	// RmDir removes an empty child directory from this directory
	// inode. Implementations must update the parent directory's link count,
	// if required. Implementations are not responsible for checking that child
	// is a directory, or checking for an empty directory.
	RmDir(ctx context.Context, name string, child Inode) error

	// Rename is called on the source directory containing an inode being
	// renamed. child points to the resolved child in the source directory.
	// dstDir is guaranteed to be a directory inode.
	//
	// On a successful call to Rename, the caller updates the dentry tree to
	// reflect the name change.
	//
	// Precondition: Caller must serialize concurrent calls to Rename.
	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error

	// Lookup should return an appropriate inode if name should resolve to a
	// child of this directory inode. This gives the directory an opportunity
	// on every lookup to resolve additional entries. This is only called when
	// the inode is a directory.
	//
	// The child returned by Lookup will be hashed into the VFS dentry tree,
	// at least for the duration of the current FS operation.
	//
	// Lookup must return the child with an extra reference whose ownership is
	// transferred to the dentry that is created to point to that inode. If
	// Inode.Keep returns false, that new dentry will be dropped at the end of
	// the current filesystem operation (before returning back to the VFS
	// layer) if no other ref is picked on that dentry. If Inode.Keep returns
	// true, then the dentry will be cached into the dentry tree until it is
	// Unlink'd or RmDir'd.
	Lookup(ctx context.Context, name string) (Inode, error)

	// IterDirents is used to iterate over dynamically created entries. It invokes
	// callback on each entry in the directory represented by the Inode.
	// 'offset' is the offset for the entire IterDirents call, which may include
	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
	// inside the entries returned by this IterDirents invocation. In other words,
	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
	// the return value, while 'relOffset' is the place to start iteration.
	IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
}

// inodeSymlink groups the symlink methods of Inode.
type inodeSymlink interface {
	// Readlink returns the target of a symbolic link. If an inode is not a
	// symlink, the implementation should return EINVAL.
	//
	// Readlink is called with no kernfs locks held, so it may reenter if needed
	// to resolve symlink targets.
	Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)

	// Getlink returns the target of a symbolic link, as used by path
	// resolution:
	//
	//   - If the inode is a "magic link" (a link whose target is most accurately
	//     represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
	//     nil). A reference is taken on the returned VirtualDentry.
	//
	//   - If the inode is an ordinary symlink, Getlink returns (zero-value
	//     VirtualDentry, symlink target, nil).
	//
	//   - If the inode is not a symlink, Getlink returns (zero-value
	//     VirtualDentry, "", EINVAL).
	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
}