github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fs/dirent.go (about) 1 // Copyright 2018 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package fs 16 17 import ( 18 "fmt" 19 "path" 20 "sync/atomic" 21 22 "golang.org/x/sys/unix" 23 "github.com/SagerNet/gvisor/pkg/abi/linux" 24 "github.com/SagerNet/gvisor/pkg/context" 25 "github.com/SagerNet/gvisor/pkg/errors/linuxerr" 26 "github.com/SagerNet/gvisor/pkg/refs" 27 "github.com/SagerNet/gvisor/pkg/sentry/kernel/auth" 28 "github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport" 29 "github.com/SagerNet/gvisor/pkg/sentry/uniqueid" 30 "github.com/SagerNet/gvisor/pkg/sync" 31 "github.com/SagerNet/gvisor/pkg/syserror" 32 ) 33 34 type globalDirentMap struct { 35 mu sync.Mutex 36 dirents map[*Dirent]struct{} 37 } 38 39 func (g *globalDirentMap) add(d *Dirent) { 40 g.mu.Lock() 41 g.dirents[d] = struct{}{} 42 g.mu.Unlock() 43 } 44 45 func (g *globalDirentMap) remove(d *Dirent) { 46 g.mu.Lock() 47 delete(g.dirents, d) 48 g.mu.Unlock() 49 } 50 51 // allDirents keeps track of all Dirents that need to be considered in 52 // Save/Restore for inode mappings. 53 // 54 // Because inodes do not hold paths, but inodes for external file systems map 55 // to an external path, every user-visible Dirent is stored in this map and 56 // iterated through upon save to keep inode ID -> restore path mappings. 
57 var allDirents = globalDirentMap{ 58 dirents: map[*Dirent]struct{}{}, 59 } 60 61 // renameMu protects the parent of *all* Dirents. (See explanation in 62 // lockForRename.) 63 // 64 // See fs.go for lock ordering. 65 var renameMu sync.RWMutex 66 67 // Dirent holds an Inode in memory. 68 // 69 // A Dirent may be negative or positive: 70 // 71 // A negative Dirent contains a nil Inode and indicates that a path does not exist. This 72 // is a convention taken from the Linux dcache, see fs/dcache.c. A negative Dirent remains 73 // cached until a create operation replaces it with a positive Dirent. A negative Dirent 74 // always has one reference owned by its parent and takes _no_ reference on its parent. This 75 // ensures that its parent can be unhashed regardless of negative children. 76 // 77 // A positive Dirent contains a non-nil Inode. It remains cached for as long as there remain 78 // references to it. A positive Dirent always takes a reference on its parent. 79 // 80 // A Dirent may be a root Dirent (parent is nil) or be parented (non-nil parent). 81 // 82 // Dirents currently do not attempt to free entries that lack application references under 83 // memory pressure. 84 // 85 // +stateify savable 86 type Dirent struct { 87 // AtomicRefCount is our reference count. 88 refs.AtomicRefCount 89 90 // userVisible indicates whether the Dirent is visible to the user or 91 // not. Only user-visible Dirents should save inode mappings in 92 // save/restore, as only they hold the real path to the underlying 93 // inode. 94 // 95 // See newDirent and Dirent.afterLoad. 96 userVisible bool 97 98 // Inode is the underlying file object. 99 // 100 // Inode is exported currently to assist in implementing overlay Inodes (where a 101 // Inode.InodeOperations.Lookup may need to merge the Inode contained in a positive Dirent with 102 // another Inode). This is normally done before the Dirent is parented (there are 103 // no external references to it). 
104 // 105 // Other objects in the VFS may take a reference to this Inode but only while holding 106 // a reference to this Dirent. 107 Inode *Inode 108 109 // name is the name (i.e. basename) of this entry. 110 // 111 // N.B. name is protected by parent.mu, not this node's mu! 112 name string 113 114 // parent is the parent directory. 115 // 116 // We hold a hard reference to the parent. 117 // 118 // parent is protected by renameMu. 119 parent *Dirent 120 121 // deleted may be set atomically when removed. 122 deleted int32 123 124 // mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED. 125 mounted bool 126 127 // direntEntry identifies this Dirent as an element in a DirentCache. DirentCaches 128 // and their contents are not saved. 129 direntEntry `state:"nosave"` 130 131 // dirMu is a read-write mutex that protects caching decisions made by directory operations. 132 // Lock ordering: dirMu must be taken before mu (see below). Details: 133 // 134 // dirMu does not participate in Rename; instead mu and renameMu are used, see lockForRename. 135 // 136 // Creation and Removal operations must be synchronized with Walk to prevent stale negative 137 // caching. Note that this requirement is not specific to a _Dirent_ doing negative caching. 
138 // The following race exists at any level of the VFS: 139 // 140 // For an object D that represents a directory, containing a cache of non-existent paths, 141 // protected by D.cacheMu: 142 // 143 // T1: T2: 144 // D.lookup(name) 145 // --> ENOENT 146 // D.create(name) 147 // --> success 148 // D.cacheMu.Lock 149 // delete(D.cache, name) 150 // D.cacheMu.Unlock 151 // D.cacheMu.Lock 152 // D.cache[name] = true 153 // D.cacheMu.Unlock 154 // 155 // D.lookup(name) 156 // D.cacheMu.Lock 157 // if D.cache[name] { 158 // --> ENOENT (wrong) 159 // } 160 // D.cacheMu.Lock 161 // 162 // Correct: 163 // 164 // T1: T2: 165 // D.cacheMu.Lock 166 // D.lookup(name) 167 // --> ENOENT 168 // D.cache[name] = true 169 // D.cacheMu.Unlock 170 // D.cacheMu.Lock 171 // D.create(name) 172 // --> success 173 // delete(D.cache, name) 174 // D.cacheMu.Unlock 175 // 176 // D.cacheMu.Lock 177 // D.lookup(name) 178 // --> EXISTS (right) 179 // D.cacheMu.Unlock 180 // 181 // Note that the above "correct" solution causes too much lock contention: all lookups are 182 // synchronized with each other. This is a problem because lookups are involved in any VFS 183 // path operation. 184 // 185 // A Dirent diverges from the single D.cacheMu and instead uses two locks: dirMu to protect 186 // concurrent creation/removal/lookup caching, and mu to protect the Dirent's children map 187 // in general. 188 // 189 // This allows for concurrent Walks to be executed in order to pipeline lookups. For instance 190 // for a hot directory /a/b, threads T1, T2, T3 will only block on each other update the 191 // children map of /a/b when their individual lookups complete. 192 // 193 // T1: T2: T3: 194 // stat(/a/b/c) stat(/a/b/d) stat(/a/b/e) 195 dirMu sync.RWMutex `state:"nosave"` 196 197 // mu protects the below fields. Lock ordering: mu must be taken after dirMu. 198 mu sync.Mutex `state:"nosave"` 199 200 // children are cached via weak references. 
201 children map[string]*refs.WeakRef `state:".(map[string]*Dirent)"` 202 } 203 204 // NewDirent returns a new root Dirent, taking the caller's reference on inode. The caller 205 // holds the only reference to the Dirent. Parents may call hashChild to parent this Dirent. 206 func NewDirent(ctx context.Context, inode *Inode, name string) *Dirent { 207 d := newDirent(inode, name) 208 allDirents.add(d) 209 d.userVisible = true 210 return d 211 } 212 213 // NewTransientDirent creates a transient Dirent that shouldn't actually be 214 // visible to users. 215 // 216 // An Inode is required. 217 func NewTransientDirent(inode *Inode) *Dirent { 218 if inode == nil { 219 panic("an inode is required") 220 } 221 return newDirent(inode, "transient") 222 } 223 224 func newDirent(inode *Inode, name string) *Dirent { 225 // The Dirent needs to maintain one reference to MountSource. 226 if inode != nil { 227 inode.MountSource.IncDirentRefs() 228 } 229 d := Dirent{ 230 Inode: inode, 231 name: name, 232 children: make(map[string]*refs.WeakRef), 233 } 234 d.EnableLeakCheck("fs.Dirent") 235 return &d 236 } 237 238 // NewNegativeDirent returns a new root negative Dirent. Otherwise same as NewDirent. 239 func NewNegativeDirent(name string) *Dirent { 240 return newDirent(nil, name) 241 } 242 243 // IsRoot returns true if d is a root Dirent. 244 func (d *Dirent) IsRoot() bool { 245 return d.parent == nil 246 } 247 248 // IsNegative returns true if d represents a path that does not exist. 249 func (d *Dirent) IsNegative() bool { 250 return d.Inode == nil 251 } 252 253 // hashChild will hash child into the children list of its new parent d. 254 // 255 // Returns (*WeakRef, true) if hashing child caused a Dirent to be unhashed. The caller must 256 // validate the returned unhashed weak reference. Common cases: 257 // 258 // * Remove: hashing a negative Dirent unhashes a positive Dirent (unimplemented). 259 // * Create: hashing a positive Dirent unhashes a negative Dirent. 
// * Lookup: hashing any Dirent should not unhash any other Dirent.
//
// Preconditions:
// * d.mu must be held.
// * child must be a root Dirent.
func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
	if !child.IsRoot() {
		panic("hashChild must be a root Dirent")
	}

	// Assign parentage.
	child.parent = d

	// Avoid letting negative Dirents take a reference on their parent; these Dirents
	// don't have a role outside of the Dirent cache and should not keep their parent
	// indefinitely pinned.
	if !child.IsNegative() {
		// Positive dirents must take a reference on their parent.
		d.IncRef()
	}

	return d.hashChildParentSet(child)
}

// hashChildParentSet will rehash child into the children list of its parent d.
//
// Assumes that child.parent = d already; panics otherwise.
//
// Returns the weak reference previously stored under child.name and true if
// there was one, so the caller can validate and drop it.
func (d *Dirent) hashChildParentSet(child *Dirent) (*refs.WeakRef, bool) {
	if child.parent != d {
		panic("hashChildParentSet assumes the child already belongs to the parent")
	}

	// Save any replaced child so our caller can validate it.
	old, ok := d.children[child.name]

	// Hash the child.
	d.children[child.name] = refs.NewWeakRef(child, nil)

	// Return any replaced child.
	return old, ok
}

// SyncAll iterates through mount points under d and writes back their buffered
// modifications to filesystems.
//
// d.mu is held for the duration of the call, including the recursion into
// cached children.
func (d *Dirent) SyncAll(ctx context.Context) {
	d.mu.Lock()
	defer d.mu.Unlock()

	// For negative Dirents there is nothing to sync. By definition these are
	// leaves (there is nothing left to traverse).
	if d.IsNegative() {
		return
	}

	// There is nothing to sync for a read-only filesystem.
	if !d.Inode.MountSource.Flags.ReadOnly {
		// NOTE(b/34856369): This should be a mount traversal, not a Dirent
		// traversal, because some Inodes that need to be synced may no longer
		// be reachable by name (after sys_unlink).
		//
		// Write out metadata, dirty page cached pages, and sync disk/remote
		// caches.
		d.Inode.WriteOut(ctx)
	}

	// Continue iterating through other mounted filesystems.
	for _, w := range d.children {
		// Only children whose weak reference still resolves are visited; the
		// reference obtained from Get is dropped once the child is synced.
		if child := w.Get(); child != nil {
			child.(*Dirent).SyncAll(ctx)
			child.DecRef(ctx)
		}
	}
}

// BaseName returns the base name of the dirent.
//
// For a parented Dirent the name is read under the parent's mu, which is the
// lock that protects name.
//
// NOTE(review): d.parent is read here without renameMu, although the Dirent
// struct documents parent as protected by renameMu — confirm callers make
// this safe.
func (d *Dirent) BaseName() string {
	p := d.parent
	if p == nil {
		return d.name
	}
	p.mu.Lock()
	defer p.mu.Unlock()
	return d.name
}

// FullName returns the fully-qualified name and a boolean value representing
// whether this Dirent was a descendant of root.
// If the root argument is nil it is assumed to be the root of the Dirent tree.
//
// renameMu is held for reading while the path is assembled.
func (d *Dirent) FullName(root *Dirent) (string, bool) {
	renameMu.RLock()
	defer renameMu.RUnlock()
	return d.fullName(root)
}

// fullName returns the fully-qualified name and a boolean value representing
// if the root node was reachable from this Dirent.
//
// The path is built recursively from the parent chain. Every component whose
// deleted flag is set gets a " (deleted)" suffix appended at its level.
func (d *Dirent) fullName(root *Dirent) (string, bool) {
	if d == root {
		return "/", true
	}

	if d.IsRoot() {
		if root != nil {
			// We reached the top of the Dirent tree but did not encounter
			// the given root. Return false for reachable so the caller
			// can handle this situation accordingly.
			return d.name, false
		}
		return d.name, true
	}

	// Traverse up to parent.
	// name is protected by the parent's mu, so copy it under that lock.
	d.parent.mu.Lock()
	name := d.name
	d.parent.mu.Unlock()
	parentName, reachable := d.parent.fullName(root)
	s := path.Join(parentName, name)
	if atomic.LoadInt32(&d.deleted) != 0 {
		return s + " (deleted)", reachable
	}
	return s, reachable
}

// MountRoot finds and returns the mount-root for a given dirent.
//
// It walks up the parent chain until it reaches a Dirent that is mounted or
// has no parent. A reference is taken on the returned Dirent before it is
// returned.
func (d *Dirent) MountRoot() *Dirent {
	renameMu.RLock()
	defer renameMu.RUnlock()

	mountRoot := d
	for !mountRoot.mounted && mountRoot.parent != nil {
		mountRoot = mountRoot.parent
	}
	mountRoot.IncRef()
	return mountRoot
}

// descendantOf returns true if the receiver dirent is equal to, or a
// descendant of, the argument dirent.
//
// d.mu must be held.
func (d *Dirent) descendantOf(p *Dirent) bool {
	if d == p {
		return true
	}
	if d.IsRoot() {
		return false
	}
	return d.parent.descendantOf(p)
}

// walk walks to path name starting at the dirent, and will not traverse above
// root Dirent.
//
// If walkMayUnlock is true then walk can unlock d.mu to execute a slow
// Inode.Lookup, otherwise walk will keep d.mu locked.
//
// On success the returned Dirent is positive and carries a reference for the
// caller. A negative Dirent at name, whether cached or freshly looked up, is
// reported as ENOENT. ENOTDIR is returned if d is not a directory.
//
// Preconditions:
// * renameMu must be held for reading.
// * d.mu must be held.
// * name must not contain "/"s.
func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
	if !IsDir(d.Inode.StableAttr) {
		return nil, unix.ENOTDIR
	}

	if name == "" || name == "." {
		d.IncRef()
		return d, nil
	} else if name == ".." {
		// Respect the chroot. Note that in Linux there is no check to enforce
		// that d is a descendant of root.
		if d == root {
			d.IncRef()
			return d, nil
		}
		// Are we already at the root? Then ".." is ".".
		if d.IsRoot() {
			d.IncRef()
			return d, nil
		}
		d.parent.IncRef()
		return d.parent, nil
	}

	if w, ok := d.children[name]; ok {
		// Try to resolve the weak reference to a hard reference.
		if child := w.Get(); child != nil {
			cd := child.(*Dirent)

			// Is this a negative Dirent?
			if cd.IsNegative() {
				// Don't leak a reference; this doesn't matter as much for negative Dirents,
				// which don't hold a hard reference on their parent (their parent holds a
				// hard reference on them, and they contain virtually no state). But this is
				// good house-keeping.
				child.DecRef(ctx)
				return nil, unix.ENOENT
			}

			// Do we need to revalidate this child?
			//
			// We never allow the file system to revalidate mounts, that could cause them
			// to unexpectedly drop out before umount.
			if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, name, d.Inode, cd.Inode) {
				// Good to go. This is the fast-path.
				return cd, nil
			}

			// If we're revalidating a child, we must ensure all inotify watches release
			// their pins on the child. Inotify doesn't properly support filesystems that
			// revalidate dirents (since watches are lost on revalidation), but if we fail
			// to unpin the watches child will never be GCed.
			cd.Inode.Watches.Unpin(ctx, cd)

			// This child needs to be revalidated, fallthrough to unhash it. Make sure
			// to not leak a reference from Get().
			//
			// Note that previous lookups may still have a reference to this stale child;
			// this can't be helped, but we can ensure that *new* lookups are up-to-date.
			child.DecRef(ctx)
		}

		// Either our weak reference expired or we need to revalidate it. Unhash child first, we're
		// about to replace it.
		delete(d.children, name)
		w.Drop(ctx)
	}

	// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
	// expensive, if possible release the lock and re-acquire it.
	if walkMayUnlock {
		d.mu.Unlock() // +checklocksforce: results in an inconsistent block.
	}
	c, err := d.Inode.Lookup(ctx, name)
	if walkMayUnlock {
		d.mu.Lock() // +checklocksforce: see above.
	}
	// No dice.
	if err != nil {
		return nil, err
	}

	// Sanity check c, its name must be consistent.
	if c.name != name {
		panic(fmt.Sprintf("lookup from %q to %q returned unexpected name %q", d.name, name, c.name))
	}

	// Now that we have the lock again, check if we raced.
	if w, ok := d.children[name]; ok {
		// Someone else looked up or created a child at name before us.
		if child := w.Get(); child != nil {
			cd := child.(*Dirent)

			// There are active references to the existing child, prefer it to the one we
			// retrieved from Lookup. Likely the Lookup happened very close to the insertion
			// of child, so considering one stale over the other is fairly arbitrary.
			c.DecRef(ctx)

			// The child that was installed could be negative.
			if cd.IsNegative() {
				// If so, don't leak a reference and short circuit.
				child.DecRef(ctx)
				return nil, unix.ENOENT
			}

			// We make the judgement call that if c raced with cd they are close enough to have
			// the same staleness, so we don't attempt to revalidate cd. In Linux revalidations
			// can continue indefinitely (see fs/namei.c, retry_estale); we try to avoid this.
			return cd, nil
		}

		// Weak reference expired. We went through a full cycle of create/destroy in the time
		// we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child
		// we looked up.
		delete(d.children, name)
		w.Drop(ctx)
	}

	// Give the looked up child a parent. We cannot kick out entries, since we just checked above
	// that there is nothing at name in d's children list.
	if _, kicked := d.hashChild(c); kicked {
		// Yell loudly.
		panic(fmt.Sprintf("hashed child %q over existing child", c.name))
	}

	// Is this a negative Dirent?
	if c.IsNegative() {
		// Don't drop a reference on the negative Dirent, it was just installed and this is the
		// only reference we'll ever get. d owns the reference.
		return nil, unix.ENOENT
	}

	// Return the positive Dirent.
	return c, nil
}

// Walk walks to a new dirent, and will not walk higher than the given root
// Dirent, which must not be nil.
//
// It takes renameMu and d.dirMu for reading plus d.mu, and allows walk to
// release d.mu around the file system lookup.
func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, error) {
	if root == nil {
		panic("Dirent.Walk: root must not be nil")
	}

	// We could use lockDirectory here, but this is a hot path and we want
	// to avoid defer.
	renameMu.RLock()
	d.dirMu.RLock()
	d.mu.Lock()

	child, err := d.walk(ctx, root, name, true /* may unlock */)

	d.mu.Unlock()
	d.dirMu.RUnlock()
	renameMu.RUnlock()

	return child, err
}

// exists returns true if name exists in relation to d.
//
// Any error from walk, not only ENOENT, is reported as "does not exist".
//
// Preconditions:
// * renameMu must be held for reading.
// * d.mu must be held.
// * name must not contain "/"s.
func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
	child, err := d.walk(ctx, root, name, false /* may unlock */)
	if err != nil {
		// Child may not exist.
		return false
	}
	// Child exists.
	child.DecRef(ctx)
	return true
}

// lockDirectory should be called for any operation that changes this `d`s
// children (creating or removing them).
//
// It acquires renameMu for reading, then d.dirMu for writing, then d.mu.
// +checklocksacquire:d.dirMu
// +checklocksacquire:d.mu
func (d *Dirent) lockDirectory() {
	renameMu.RLock()
	d.dirMu.Lock()
	d.mu.Lock()
}

// unlockDirectory is the reverse of lockDirectory.
606 // +checklocksrelease:d.dirMu 607 // +checklocksrelease:d.mu 608 func (d *Dirent) unlockDirectory() { 609 d.mu.Unlock() 610 d.dirMu.Unlock() 611 renameMu.RUnlock() // +checklocksforce: see lockDirectory. 612 } 613 614 // Create creates a new regular file in this directory. 615 func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) { 616 d.lockDirectory() 617 defer d.unlockDirectory() 618 619 // Does something already exist? 620 if d.exists(ctx, root, name) { 621 return nil, unix.EEXIST 622 } 623 624 // Try the create. We need to trust the file system to return EEXIST (or something 625 // that will translate to EEXIST) if name already exists. 626 file, err := d.Inode.Create(ctx, d, name, flags, perms) 627 if err != nil { 628 return nil, err 629 } 630 child := file.Dirent 631 632 d.finishCreate(ctx, child, name) 633 634 // Return the reference and the new file. When the last reference to 635 // the file is dropped, file.Dirent may no longer be cached. 636 return file, nil 637 } 638 639 // finishCreate validates the created file, adds it as a child of this dirent, 640 // and notifies any watchers. 641 func (d *Dirent) finishCreate(ctx context.Context, child *Dirent, name string) { 642 // Sanity check c, its name must be consistent. 643 if child.name != name { 644 panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name)) 645 } 646 647 // File systems cannot return a negative Dirent on Create, that makes no sense. 648 if child.IsNegative() { 649 panic(fmt.Sprintf("create from %q to %q returned negative Dirent", d.name, name)) 650 } 651 652 // Hash the child into its parent. We can only kick out a Dirent if it is negative 653 // (we are replacing something that does not exist with something that now does). 
654 if w, kicked := d.hashChild(child); kicked { 655 if old := w.Get(); old != nil { 656 if !old.(*Dirent).IsNegative() { 657 panic(fmt.Sprintf("hashed child %q over a positive child", child.name)) 658 } 659 // Don't leak a reference. 660 old.DecRef(ctx) 661 662 // Drop d's reference. 663 old.DecRef(ctx) 664 } 665 666 // Finally drop the useless weak reference on the floor. 667 w.Drop(ctx) 668 } 669 670 d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) 671 672 // Allow the file system to take extra references on c. 673 child.maybeExtendReference() 674 } 675 676 // genericCreate executes create if name does not exist. Removes a negative Dirent at name if 677 // create succeeds. 678 func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error { 679 d.lockDirectory() 680 defer d.unlockDirectory() 681 682 // Does something already exist? 683 if d.exists(ctx, root, name) { 684 return unix.EEXIST 685 } 686 687 // Remove any negative Dirent. We've already asserted above with d.exists 688 // that the only thing remaining here can be a negative Dirent. 689 if w, ok := d.children[name]; ok { 690 // Same as Create. 691 if old := w.Get(); old != nil { 692 if !old.(*Dirent).IsNegative() { 693 panic(fmt.Sprintf("hashed over a positive child %q", old.(*Dirent).name)) 694 } 695 // Don't leak a reference. 696 old.DecRef(ctx) 697 698 // Drop d's reference. 699 old.DecRef(ctx) 700 } 701 702 // Unhash the negative Dirent, name needs to exist now. 703 delete(d.children, name) 704 705 // Finally drop the useless weak reference on the floor. 706 w.Drop(ctx) 707 } 708 709 // Execute the create operation. 710 return create() 711 } 712 713 // CreateLink creates a new link in this directory. 
714 func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error { 715 return d.genericCreate(ctx, root, newname, func() error { 716 if err := d.Inode.CreateLink(ctx, d, oldname, newname); err != nil { 717 return err 718 } 719 d.Inode.Watches.Notify(newname, linux.IN_CREATE, 0) 720 return nil 721 }) 722 } 723 724 // CreateHardLink creates a new hard link in this directory. 725 func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error { 726 // Make sure that target does not span filesystems. 727 if d.Inode.MountSource != target.Inode.MountSource { 728 return unix.EXDEV 729 } 730 731 // Directories are never linkable. See fs/namei.c:vfs_link. 732 if IsDir(target.Inode.StableAttr) { 733 return unix.EPERM 734 } 735 736 return d.genericCreate(ctx, root, name, func() error { 737 if err := d.Inode.CreateHardLink(ctx, d, target, name); err != nil { 738 return err 739 } 740 target.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) // Link count change. 741 d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) 742 return nil 743 }) 744 } 745 746 // CreateDirectory creates a new directory under this dirent. 747 func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { 748 return d.genericCreate(ctx, root, name, func() error { 749 if err := d.Inode.CreateDirectory(ctx, d, name, perms); err != nil { 750 return err 751 } 752 d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_CREATE, 0) 753 return nil 754 }) 755 } 756 757 // Bind satisfies the InodeOperations interface; otherwise same as GetFile. 
758 func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data transport.BoundEndpoint, perms FilePermissions) (*Dirent, error) { 759 var childDir *Dirent 760 err := d.genericCreate(ctx, root, name, func() error { 761 var e error 762 childDir, e = d.Inode.Bind(ctx, d, name, data, perms) 763 if e != nil { 764 return e 765 } 766 d.finishCreate(ctx, childDir, name) 767 return nil 768 }) 769 if err == unix.EEXIST { 770 return nil, unix.EADDRINUSE 771 } 772 if err != nil { 773 return nil, err 774 } 775 return childDir, err 776 } 777 778 // CreateFifo creates a new named pipe under this dirent. 779 func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error { 780 return d.genericCreate(ctx, root, name, func() error { 781 if err := d.Inode.CreateFifo(ctx, d, name, perms); err != nil { 782 return err 783 } 784 d.Inode.Watches.Notify(name, linux.IN_CREATE, 0) 785 return nil 786 }) 787 } 788 789 // GetDotAttrs returns the DentAttrs corresponding to "." and ".." directories. 790 func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) { 791 // Get '.'. 792 sattr := d.Inode.StableAttr 793 dot := DentAttr{ 794 Type: sattr.Type, 795 InodeID: sattr.InodeID, 796 } 797 798 // Hold d.mu while we call d.descendantOf. 799 d.mu.Lock() 800 defer d.mu.Unlock() 801 802 // Get '..'. 803 if !d.IsRoot() && d.descendantOf(root) { 804 // Dirent is a descendant of the root. Get its parent's attrs. 805 psattr := d.parent.Inode.StableAttr 806 dotdot := DentAttr{ 807 Type: psattr.Type, 808 InodeID: psattr.InodeID, 809 } 810 return dot, dotdot 811 } 812 // Dirent is either root or not a descendant of the root. ".." is the 813 // same as ".". 814 return dot, dot 815 } 816 817 // DirIterator is an open directory containing directory entries that can be read. 
818 type DirIterator interface { 819 // IterateDir emits directory entries by calling dirCtx.EmitDir, beginning 820 // with the entry at offset and returning the next directory offset. 821 // 822 // Entries for "." and ".." must *not* be included. 823 // 824 // If the offset returned is the same as the argument offset, then 825 // nothing has been serialized. This is equivalent to reaching EOF. 826 // In this case serializer.Written() should return 0. 827 // 828 // The order of entries to emit must be consistent between Readdir 829 // calls, and must start with the given offset. 830 // 831 // The caller must ensure that this operation is permitted. 832 IterateDir(ctx context.Context, d *Dirent, dirCtx *DirCtx, offset int) (int, error) 833 } 834 835 // DirentReaddir serializes the directory entries of d including "." and "..". 836 // 837 // Arguments: 838 // 839 // * d: the Dirent of the directory being read; required to provide "." and "..". 840 // * it: the directory iterator; which represents an open directory handle. 841 // * root: fs root; if d is equal to the root, then '..' will refer to d. 842 // * ctx: context provided to file systems in order to select and serialize entries. 843 // * offset: the current directory offset. 844 // 845 // Returns the offset of the *next* element which was not serialized. 846 func DirentReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) { 847 offset, err := direntReaddir(ctx, d, it, root, dirCtx, offset) 848 // Serializing any directory entries at all means success. 
849 if dirCtx.Serializer.Written() > 0 { 850 return offset, nil 851 } 852 return offset, err 853 } 854 855 func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) { 856 if root == nil { 857 panic("Dirent.Readdir: root must not be nil") 858 } 859 if dirCtx.Serializer == nil { 860 panic("Dirent.Readdir: serializer must not be nil") 861 } 862 863 // Check that this is actually a directory before emitting anything. 864 // Once we have written entries for "." and "..", future errors from 865 // IterateDir will be hidden. 866 if !IsDir(d.Inode.StableAttr) { 867 return 0, syserror.ENOTDIR 868 } 869 870 // This is a special case for lseek(fd, 0, SEEK_END). 871 // See SeekWithDirCursor for more details. 872 if offset == FileMaxOffset { 873 return offset, nil 874 } 875 876 // Collect attrs for "." and "..". 877 dot, dotdot := d.GetDotAttrs(root) 878 879 // Emit "." and ".." if the offset is low enough. 880 if offset == 0 { 881 // Serialize ".". 882 if err := dirCtx.DirEmit(".", dot); err != nil { 883 return offset, err 884 } 885 offset++ 886 } 887 if offset == 1 { 888 // Serialize "..". 889 if err := dirCtx.DirEmit("..", dotdot); err != nil { 890 return offset, err 891 } 892 offset++ 893 } 894 895 // it.IterateDir should be passed an offset that does not include the 896 // initial dot elements. We will add them back later. 897 offset -= 2 898 newOffset, err := it.IterateDir(ctx, d, dirCtx, int(offset)) 899 if int64(newOffset) < offset { 900 panic(fmt.Sprintf("node.Readdir returned offset %v less than input offset %v", newOffset, offset)) 901 } 902 // Add the initial nodes back to the offset count. 903 newOffset += 2 904 return int64(newOffset), err 905 } 906 907 // flush flushes all weak references recursively, and removes any cached 908 // references to children. 909 // 910 // Preconditions: d.mu must be held. 
911 func (d *Dirent) flush(ctx context.Context) { 912 expired := make(map[string]*refs.WeakRef) 913 for n, w := range d.children { 914 // Call flush recursively on each child before removing our 915 // reference on it, and removing the cache's reference. 916 if child := w.Get(); child != nil { 917 cd := child.(*Dirent) 918 919 if !cd.IsNegative() { 920 // Flush the child. 921 cd.mu.Lock() 922 cd.flush(ctx) 923 cd.mu.Unlock() 924 925 // Allow the file system to drop extra references on child. 926 cd.dropExtendedReference() 927 } 928 929 // Don't leak a reference. 930 child.DecRef(ctx) 931 } 932 // Check if the child dirent is closed, and mark it as expired if it is. 933 // We must call w.Get() again here, since the child could have been closed 934 // by the calls to flush() and cache.Remove() in the above if-block. 935 if child := w.Get(); child != nil { 936 child.DecRef(ctx) 937 } else { 938 expired[n] = w 939 } 940 } 941 942 // Remove expired entries. 943 for n, w := range expired { 944 delete(d.children, n) 945 w.Drop(ctx) 946 } 947 } 948 949 // isMountPoint returns true if the dirent is a mount point or the root. 950 func (d *Dirent) isMountPoint() bool { 951 d.mu.Lock() 952 defer d.mu.Unlock() 953 return d.isMountPointLocked() 954 } 955 956 func (d *Dirent) isMountPointLocked() bool { 957 return d.mounted || d.parent == nil 958 } 959 960 // mount mounts a new dirent with the given inode over d. 961 // 962 // Precondition: must be called with mm.withMountLocked held on `d`. 963 func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) { 964 // Did we race with deletion? 965 if atomic.LoadInt32(&d.deleted) != 0 { 966 return nil, syserror.ENOENT 967 } 968 969 // Refuse to mount a symlink. 970 // 971 // See Linux equivalent in fs/namespace.c:do_add_mount. 972 if IsSymlink(inode.StableAttr) { 973 return nil, linuxerr.EINVAL 974 } 975 976 // Dirent that'll replace d. 
977 // 978 // Note that NewDirent returns with one reference taken; the reference 979 // is donated to the caller as the mount reference. 980 replacement := NewDirent(ctx, inode, d.name) 981 replacement.mounted = true 982 983 weakRef, ok := d.parent.hashChild(replacement) 984 if !ok { 985 panic("mount must mount over an existing dirent") 986 } 987 weakRef.Drop(ctx) 988 989 // Note that even though `d` is now hidden, it still holds a reference 990 // to its parent. 991 return replacement, nil 992 } 993 994 // unmount unmounts `d` and replaces it with the last Dirent that was in its 995 // place, supplied by the MountNamespace as `replacement`. 996 // 997 // Precondition: must be called with mm.withMountLocked held on `d`. 998 func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error { 999 // Did we race with deletion? 1000 if atomic.LoadInt32(&d.deleted) != 0 { 1001 return syserror.ENOENT 1002 } 1003 1004 // Remount our former child in its place. 1005 // 1006 // As replacement used to be our child, it must already have the right 1007 // parent. 1008 weakRef, ok := d.parent.hashChildParentSet(replacement) 1009 if !ok { 1010 panic("mount must mount over an existing dirent") 1011 } 1012 weakRef.Drop(ctx) 1013 1014 // d is not reachable anymore, and hence not mounted anymore. 1015 d.mounted = false 1016 1017 // Drop mount reference. 1018 d.DecRef(ctx) 1019 return nil 1020 } 1021 1022 // Remove removes the given file or symlink. The root dirent is used to 1023 // resolve name, and must not be nil. 1024 func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath bool) error { 1025 // Check the root. 1026 if root == nil { 1027 panic("Dirent.Remove: root must not be nil") 1028 } 1029 1030 d.lockDirectory() 1031 defer d.unlockDirectory() 1032 1033 // Try to walk to the node. 1034 child, err := d.walk(ctx, root, name, false /* may unlock */) 1035 if err != nil { 1036 // Child does not exist. 
1037 return err 1038 } 1039 defer child.DecRef(ctx) 1040 1041 // Remove cannot remove directories. 1042 if IsDir(child.Inode.StableAttr) { 1043 return unix.EISDIR 1044 } else if dirPath { 1045 return unix.ENOTDIR 1046 } 1047 1048 // Remove cannot remove a mount point. 1049 if child.isMountPoint() { 1050 return unix.EBUSY 1051 } 1052 1053 // Try to remove name on the file system. 1054 if err := d.Inode.Remove(ctx, d, child); err != nil { 1055 return err 1056 } 1057 1058 // Link count changed, this only applies to non-directory nodes. 1059 child.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) 1060 1061 // Mark name as deleted and remove from children. 1062 atomic.StoreInt32(&child.deleted, 1) 1063 if w, ok := d.children[name]; ok { 1064 delete(d.children, name) 1065 w.Drop(ctx) 1066 } 1067 1068 // Allow the file system to drop extra references on child. 1069 child.dropExtendedReference() 1070 1071 // Finally, let inotify know the child is being unlinked. Drop any extra 1072 // refs from inotify to this child dirent. This doesn't necessarily mean the 1073 // watches on the underlying inode will be destroyed, since the underlying 1074 // inode may have other links. If this was the last link, the events for the 1075 // watch removal will be queued by the inode destructor. 1076 child.Inode.Watches.MarkUnlinked() 1077 child.Inode.Watches.Unpin(ctx, child) 1078 d.Inode.Watches.Notify(name, linux.IN_DELETE, 0) 1079 1080 return nil 1081 } 1082 1083 // RemoveDirectory removes the given directory. The root dirent is used to 1084 // resolve name, and must not be nil. 1085 func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) error { 1086 // Check the root. 1087 if root == nil { 1088 panic("Dirent.Remove: root must not be nil") 1089 } 1090 1091 d.lockDirectory() 1092 defer d.unlockDirectory() 1093 1094 // Check for dots. 1095 if name == "." { 1096 // Rejected as the last component by rmdir(2). 1097 return unix.EINVAL 1098 } 1099 if name == ".." 
{ 1100 // If d was found, then its parent is not empty. 1101 return unix.ENOTEMPTY 1102 } 1103 1104 // Try to walk to the node. 1105 child, err := d.walk(ctx, root, name, false /* may unlock */) 1106 if err != nil { 1107 // Child does not exist. 1108 return err 1109 } 1110 defer child.DecRef(ctx) 1111 1112 // RemoveDirectory can only remove directories. 1113 if !IsDir(child.Inode.StableAttr) { 1114 return unix.ENOTDIR 1115 } 1116 1117 // Remove cannot remove a mount point. 1118 if child.isMountPoint() { 1119 return unix.EBUSY 1120 } 1121 1122 // Try to remove name on the file system. 1123 if err := d.Inode.Remove(ctx, d, child); err != nil { 1124 return err 1125 } 1126 1127 // Mark name as deleted and remove from children. 1128 atomic.StoreInt32(&child.deleted, 1) 1129 if w, ok := d.children[name]; ok { 1130 delete(d.children, name) 1131 w.Drop(ctx) 1132 } 1133 1134 // Allow the file system to drop extra references on child. 1135 child.dropExtendedReference() 1136 1137 // Finally, let inotify know the child is being unlinked. Drop any extra 1138 // refs from inotify to this child dirent. 1139 child.Inode.Watches.MarkUnlinked() 1140 child.Inode.Watches.Unpin(ctx, child) 1141 d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0) 1142 1143 return nil 1144 } 1145 1146 // destroy closes this node and all children. 1147 func (d *Dirent) destroy(ctx context.Context) { 1148 if d.IsNegative() { 1149 // Nothing to tear-down and no parent references to drop, since a negative 1150 // Dirent does not take a references on its parent, has no Inode and no children. 1151 return 1152 } 1153 1154 d.mu.Lock() 1155 defer d.mu.Unlock() 1156 1157 // Drop all weak references. 1158 for _, w := range d.children { 1159 if c := w.Get(); c != nil { 1160 if c.(*Dirent).IsNegative() { 1161 // The parent holds both weak and strong refs in the case of 1162 // negative dirents. 1163 c.DecRef(ctx) 1164 } 1165 // Drop the reference we just acquired in WeakRef.Get. 
1166 c.DecRef(ctx) 1167 } 1168 w.Drop(ctx) 1169 } 1170 d.children = nil 1171 1172 allDirents.remove(d) 1173 1174 // Drop our reference to the Inode. 1175 d.Inode.DecRef(ctx) 1176 1177 // Allow the Dirent to be GC'ed after this point, since the Inode may still 1178 // be referenced after the Dirent is destroyed (for instance by filesystem 1179 // internal caches or hard links). 1180 d.Inode = nil 1181 1182 // Drop the reference we have on our parent if we took one. renameMu doesn't need to be 1183 // held because d can't be reparented without any references to it left. 1184 if d.parent != nil { 1185 d.parent.DecRef(ctx) 1186 } 1187 } 1188 1189 // IncRef increases the Dirent's refcount as well as its mount's refcount. 1190 // 1191 // IncRef implements RefCounter.IncRef. 1192 func (d *Dirent) IncRef() { 1193 if d.Inode != nil { 1194 d.Inode.MountSource.IncDirentRefs() 1195 } 1196 d.AtomicRefCount.IncRef() 1197 } 1198 1199 // TryIncRef implements RefCounter.TryIncRef. 1200 func (d *Dirent) TryIncRef() bool { 1201 ok := d.AtomicRefCount.TryIncRef() 1202 if ok && d.Inode != nil { 1203 d.Inode.MountSource.IncDirentRefs() 1204 } 1205 return ok 1206 } 1207 1208 // DecRef decreases the Dirent's refcount and drops its reference on its mount. 1209 // 1210 // DecRef implements RefCounter.DecRef with destructor d.destroy. 1211 func (d *Dirent) DecRef(ctx context.Context) { 1212 if d.Inode != nil { 1213 // Keep mount around, since DecRef may destroy d.Inode. 1214 msrc := d.Inode.MountSource 1215 d.DecRefWithDestructor(ctx, d.destroy) 1216 msrc.DecDirentRefs() 1217 } else { 1218 d.DecRefWithDestructor(ctx, d.destroy) 1219 } 1220 } 1221 1222 // InotifyEvent notifies all watches on the inode for this dirent and its parent 1223 // of potential events. The events may not actually propagate up to the user, 1224 // depending on the event masks. 
InotifyEvent automatically provides the name of 1225 // the current dirent as the subject of the event as required, and adds the 1226 // IN_ISDIR flag for dirents that refer to directories. 1227 func (d *Dirent) InotifyEvent(events, cookie uint32) { 1228 // N.B. We don't defer the unlocks because InotifyEvent is in the hot 1229 // path of all IO operations, and the defers cost too much for small IO 1230 // operations. 1231 renameMu.RLock() 1232 1233 if IsDir(d.Inode.StableAttr) { 1234 events |= linux.IN_ISDIR 1235 } 1236 1237 // The ordering below is important, Linux always notifies the parent first. 1238 if d.parent != nil { 1239 // name is immediately stale w.r.t. renames (renameMu doesn't 1240 // protect against renames in the same directory). Holding 1241 // d.parent.mu around Notify() wouldn't matter since Notify 1242 // doesn't provide a synchronous mechanism for reading the name 1243 // anyway. 1244 d.parent.mu.Lock() 1245 name := d.name 1246 d.parent.mu.Unlock() 1247 d.parent.Inode.Watches.Notify(name, events, cookie) 1248 } 1249 d.Inode.Watches.Notify("", events, cookie) 1250 1251 renameMu.RUnlock() 1252 } 1253 1254 // maybeExtendReference caches a reference on this Dirent if 1255 // MountSourceOperations.Keep returns true. 1256 func (d *Dirent) maybeExtendReference() { 1257 if msrc := d.Inode.MountSource; msrc.Keep(d) { 1258 msrc.fscache.Add(d) 1259 } 1260 } 1261 1262 // dropExtendedReference drops any cached reference held by the 1263 // MountSource on the dirent. 1264 func (d *Dirent) dropExtendedReference() { 1265 d.Inode.MountSource.fscache.Remove(d) 1266 } 1267 1268 // lockForRename takes locks on oldParent and newParent as required by Rename. 1269 // On return, unlockForRename must always be called, even with an error. 
1270 // +checklocksacquire:oldParent.mu 1271 // +checklocksacquire:newParent.mu 1272 func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) error { 1273 renameMu.Lock() 1274 if oldParent == newParent { 1275 oldParent.mu.Lock() 1276 return nil // +checklocksforce: only one lock exists. 1277 } 1278 1279 // Renaming between directories is a bit subtle: 1280 // 1281 // - A concurrent cross-directory Rename may try to lock in the opposite 1282 // order; take renameMu to prevent this from happening. 1283 // 1284 // - If either directory is an ancestor of the other, then a concurrent 1285 // Remove may lock the descendant (in DecRef -> closeAll) while holding a 1286 // lock on the ancestor; to avoid this, ensure we take locks in the same 1287 // ancestor-to-descendant order. (Holding renameMu prevents this 1288 // relationship from changing.) 1289 1290 // First check if newParent is a descendant of oldParent. 1291 child := newParent 1292 for p := newParent.parent; p != nil; p = p.parent { 1293 if p == oldParent { 1294 oldParent.mu.Lock() 1295 newParent.mu.Lock() 1296 var err error 1297 if child.name == oldName { 1298 // newParent is not just a descendant of oldParent, but 1299 // more specifically of oldParent/oldName. That is, we're 1300 // trying to rename something into a subdirectory of 1301 // itself. 1302 err = unix.EINVAL 1303 } 1304 return err 1305 } 1306 child = p 1307 } 1308 1309 // Otherwise, either oldParent is a descendant of newParent or the two 1310 // have no relationship; in either case we can do this: 1311 newParent.mu.Lock() 1312 oldParent.mu.Lock() 1313 return nil 1314 } 1315 1316 // unlockForRename is the opposite of lockForRename. 1317 // +checklocksrelease:oldParent.mu 1318 // +checklocksrelease:newParent.mu 1319 func unlockForRename(oldParent, newParent *Dirent) { 1320 if oldParent == newParent { 1321 oldParent.mu.Unlock() 1322 renameMu.Unlock() // +checklocksforce: only one lock exists. 
1323 return 1324 } 1325 newParent.mu.Unlock() 1326 oldParent.mu.Unlock() 1327 renameMu.Unlock() // +checklocksforce: not tracked. 1328 } 1329 1330 func (d *Dirent) checkSticky(ctx context.Context, victim *Dirent) error { 1331 uattr, err := d.Inode.UnstableAttr(ctx) 1332 if err != nil { 1333 return linuxerr.EPERM 1334 } 1335 if !uattr.Perms.Sticky { 1336 return nil 1337 } 1338 1339 creds := auth.CredentialsFromContext(ctx) 1340 if uattr.Owner.UID == creds.EffectiveKUID { 1341 return nil 1342 } 1343 1344 vuattr, err := victim.Inode.UnstableAttr(ctx) 1345 if err != nil { 1346 return linuxerr.EPERM 1347 } 1348 if vuattr.Owner.UID == creds.EffectiveKUID { 1349 return nil 1350 } 1351 if victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) { 1352 return nil 1353 } 1354 return linuxerr.EPERM 1355 } 1356 1357 // MayDelete determines whether `name`, a child of `d`, can be deleted or 1358 // renamed by `ctx`. 1359 // 1360 // Compare Linux kernel fs/namei.c:may_delete. 1361 func (d *Dirent) MayDelete(ctx context.Context, root *Dirent, name string) error { 1362 if err := d.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil { 1363 return err 1364 } 1365 1366 d.lockDirectory() 1367 defer d.unlockDirectory() 1368 1369 victim, err := d.walk(ctx, root, name, true /* may unlock */) 1370 if err != nil { 1371 return err 1372 } 1373 defer victim.DecRef(ctx) 1374 1375 return d.mayDelete(ctx, victim) 1376 } 1377 1378 // mayDelete determines whether `victim`, a child of `dir`, can be deleted or 1379 // renamed by `ctx`. 1380 // 1381 // Preconditions: `dir` is writable and executable by `ctx`. 1382 func (d *Dirent) mayDelete(ctx context.Context, victim *Dirent) error { 1383 if err := d.checkSticky(ctx, victim); err != nil { 1384 return err 1385 } 1386 1387 if victim.IsRoot() { 1388 return linuxerr.EBUSY 1389 } 1390 1391 return nil 1392 } 1393 1394 // Rename atomically converts the child of oldParent named oldName to a 1395 // child of newParent named newName. 
func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string, newParent *Dirent, newName string) error {
	// root is used to resolve oldName and newName and must not be nil.
	if root == nil {
		panic("Rename: root must not be nil")
	}
	// Renaming an entry onto itself is a no-op.
	if oldParent == newParent && oldName == newName {
		return nil
	}

	// Acquire global renameMu lock, and mu locks on oldParent/newParent.
	//
	// The unlock is deferred before err is inspected because lockForRename
	// leaves the locks held even when it returns an error.
	err := lockForRename(oldParent, oldName, newParent, newName)
	defer unlockForRename(oldParent, newParent)
	if err != nil {
		return err
	}

	// Do we have general permission to remove from oldParent and
	// create/replace in newParent?
	if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
		return err
	}
	if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
		return err
	}

	// renamed is the dirent that will be renamed to something else.
	renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */)
	if err != nil {
		return err
	}
	defer renamed.DecRef(ctx)

	// Check that the renamed dirent is deletable.
	if err := oldParent.mayDelete(ctx, renamed); err != nil {
		return err
	}

	// Check that the renamed dirent is not a mount point.
	if renamed.isMountPointLocked() {
		return unix.EBUSY
	}

	// Source should not be an ancestor of the target.
	if newParent.descendantOf(renamed) {
		return unix.EINVAL
	}

	// Per rename(2): "... EACCES: ... or oldpath is a directory and does not
	// allow write permission (needed to update the .. entry)."
	if IsDir(renamed.Inode.StableAttr) {
		if err := renamed.Inode.CheckPermission(ctx, PermMask{Write: true}); err != nil {
			return err
		}
	}

	// replaced is the dirent that is being overwritten by rename.
	replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */)
	if err != nil {
		// Only ENOENT is tolerated here; any other walk failure aborts.
		if !linuxerr.Equals(linuxerr.ENOENT, err) {
			return err
		}

		// newName doesn't exist; simply create it below.
		replaced = nil
	} else {
		// Check constraints on the dirent being replaced.

		// NOTE(b/111808347): We don't want to keep replaced alive
		// across the Rename, so must call DecRef manually (no defer).
		// Every return in this branch therefore drops the reference first.

		// Check that we can delete replaced.
		if err := newParent.mayDelete(ctx, replaced); err != nil {
			replaced.DecRef(ctx)
			return err
		}

		// Target should not be an ancestor of source.
		if oldParent.descendantOf(replaced) {
			replaced.DecRef(ctx)

			// Note that Linux returns EINVAL if the source is an
			// ancestor of target, but ENOTEMPTY if the target is
			// an ancestor of source (unless RENAME_EXCHANGE flag
			// is present). See fs/namei.c:renameat2.
			return unix.ENOTEMPTY
		}

		// Check that replaced is not a mount point.
		if replaced.isMountPointLocked() {
			replaced.DecRef(ctx)
			return unix.EBUSY
		}

		// Require that a directory is replaced by a directory.
		oldIsDir := IsDir(renamed.Inode.StableAttr)
		newIsDir := IsDir(replaced.Inode.StableAttr)
		if !newIsDir && oldIsDir {
			replaced.DecRef(ctx)
			return unix.ENOTDIR
		}
		if !oldIsDir && newIsDir {
			replaced.DecRef(ctx)
			return unix.EISDIR
		}

		// Allow the file system to drop extra references on replaced.
		replaced.dropExtendedReference()

		// NOTE(b/31798319,b/31867149,b/31867671): Keeping a dirent
		// open across renames is currently broken for multiple
		// reasons, so we flush all references on the replaced node and
		// its children.
		replaced.Inode.Watches.Unpin(ctx, replaced)
		replaced.mu.Lock()
		replaced.flush(ctx)
		replaced.mu.Unlock()

		// Done with replaced.
		replaced.DecRef(ctx)
	}

	// Ask the file system to perform the rename. From here on replaced is
	// only compared against nil, to tell the file system whether an existing
	// entry is being overwritten; its reference was already dropped above.
	if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil {
		return err
	}

	// The file system accepted the rename; update the in-memory tree.
	renamed.name = newName
	renamed.parent = newParent
	if oldParent != newParent {
		// Reparent the reference held by renamed.parent. oldParent.DecRef
		// can't destroy oldParent (and try to retake its lock) because
		// Rename's caller must be holding a reference.
		newParent.IncRef()
		oldParent.DecRef(ctx)
	}
	// Drop whatever was cached under the destination name and under the
	// source name.
	if w, ok := newParent.children[newName]; ok {
		w.Drop(ctx)
		delete(newParent.children, newName)
	}
	if w, ok := oldParent.children[oldName]; ok {
		w.Drop(ctx)
		delete(oldParent.children, oldName)
	}

	// Add a weak reference from the new parent. This ensures that the child
	// can still be found from the new parent if a prior hard reference is
	// held on renamed.
	//
	// This is required for file lock correctness because file locks are per-Dirent
	// and without maintaining a cached child (via a weak reference) for renamed,
	// multiple Dirents can correspond to the same resource (by virtue of the renamed
	// Dirent being unreachable by its parent and it being looked up).
	newParent.children[newName] = refs.NewWeakRef(renamed, nil)

	// Queue inotify events for the rename.
	var ev uint32
	if IsDir(renamed.Inode.StableAttr) {
		ev |= linux.IN_ISDIR
	}

	// The same cookie is passed with both the MOVED_FROM and the MOVED_TO
	// event.
	cookie := uniqueid.InotifyCookie(ctx)
	oldParent.Inode.Watches.Notify(oldName, ev|linux.IN_MOVED_FROM, cookie)
	newParent.Inode.Watches.Notify(newName, ev|linux.IN_MOVED_TO, cookie)
	// Somewhat surprisingly, self move events do not have a cookie.
	renamed.Inode.Watches.Notify("", linux.IN_MOVE_SELF, 0)

	// Allow the file system to drop extra references on renamed.
	renamed.dropExtendedReference()

	// Same as replaced.flush above.
	renamed.mu.Lock()
	renamed.flush(ctx)
	renamed.mu.Unlock()

	return nil
}