github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/gofer/filesystem.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package gofer 16 17 import ( 18 "fmt" 19 "math" 20 "strings" 21 "sync" 22 23 "golang.org/x/sys/unix" 24 "github.com/metacubex/gvisor/pkg/abi/linux" 25 "github.com/metacubex/gvisor/pkg/atomicbitops" 26 "github.com/metacubex/gvisor/pkg/context" 27 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 28 "github.com/metacubex/gvisor/pkg/fspath" 29 "github.com/metacubex/gvisor/pkg/refs" 30 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/host" 31 "github.com/metacubex/gvisor/pkg/sentry/fsmetric" 32 "github.com/metacubex/gvisor/pkg/sentry/kernel" 33 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 34 "github.com/metacubex/gvisor/pkg/sentry/kernel/pipe" 35 "github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport" 36 "github.com/metacubex/gvisor/pkg/sentry/vfs" 37 ) 38 39 // Sync implements vfs.FilesystemImpl.Sync. 40 func (fs *filesystem) Sync(ctx context.Context) error { 41 // Snapshot current syncable dentries and special file FDs. 
42 fs.syncMu.Lock() 43 ds := make([]*dentry, 0, fs.syncableDentries.Len()) 44 for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() { 45 ds = append(ds, elem.d) 46 } 47 sffds := make([]*specialFileFD, 0, fs.specialFileFDs.Len()) 48 for sffd := fs.specialFileFDs.Front(); sffd != nil; sffd = sffd.Next() { 49 sffds = append(sffds, sffd) 50 } 51 fs.syncMu.Unlock() 52 53 // Return the first error we encounter, but sync everything we can 54 // regardless. 55 var retErr error 56 57 // Note that lisafs is capable of batching FSync RPCs. However, we can not 58 // batch all the FDIDs to be synced from ds and sffds. Because the error 59 // handling varies based on file type. FSync errors are only considered for 60 // regular file FDIDs that were opened for writing. We could do individual 61 // RPCs for such FDIDs and batch the rest, but it increases code complexity 62 // substantially. We could implement it in the future if need be. 63 64 // Sync syncable dentries. 65 for _, d := range ds { 66 if err := d.syncCachedFile(ctx, true /* forFilesystemSync */); err != nil { 67 ctx.Infof("gofer.filesystem.Sync: dentry.syncCachedFile failed: %v", err) 68 if retErr == nil { 69 retErr = err 70 } 71 } 72 } 73 74 // Sync special files, which may be writable but do not use dentry shared 75 // handles (so they won't be synced by the above). 76 for _, sffd := range sffds { 77 if err := sffd.sync(ctx, true /* forFilesystemSync */); err != nil { 78 ctx.Infof("gofer.filesystem.Sync: specialFileFD.sync failed: %v", err) 79 if retErr == nil { 80 retErr = err 81 } 82 } 83 } 84 85 return retErr 86 } 87 88 // MaxFilenameLen is the maximum length of a filename. This is dictated by 9P's 89 // encoding of strings, which uses 2 bytes for the length prefix. 90 const MaxFilenameLen = (1 << 16) - 1 91 92 // dentrySlicePool is a pool of *[]*dentry used to store dentries for which 93 // dentry.checkCachingLocked() must be called. 
The pool holds pointers to 94 // slices because Go lacks generics, so sync.Pool operates on any, so 95 // every call to (what should be) sync.Pool<[]*dentry>.Put() allocates a copy 96 // of the slice header on the heap. 97 var dentrySlicePool = sync.Pool{ 98 New: func() any { 99 ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity 100 return &ds 101 }, 102 } 103 104 func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry { 105 if ds == nil { 106 ds = dentrySlicePool.Get().(*[]*dentry) 107 } 108 *ds = append(*ds, d) 109 return ds 110 } 111 112 // Precondition: !parent.isSynthetic() && !child.isSynthetic(). 113 func appendNewChildDentry(ds **[]*dentry, parent *dentry, child *dentry) { 114 // The new child was added to parent and took a ref on the parent (hence 115 // parent can be removed from cache). A new child has 0 refs for now. So 116 // checkCachingLocked() should be called on both. Call it first on the parent 117 // as it may create space in the cache for child to be inserted - hence 118 // avoiding a cache eviction. 119 *ds = appendDentry(*ds, parent) 120 *ds = appendDentry(*ds, child) 121 } 122 123 // Preconditions: ds != nil. 124 func putDentrySlice(ds *[]*dentry) { 125 // Allow dentries to be GC'd. 126 for i := range *ds { 127 (*ds)[i] = nil 128 } 129 *ds = (*ds)[:0] 130 dentrySlicePool.Put(ds) 131 } 132 133 // renameMuRUnlockAndCheckCaching calls fs.renameMu.RUnlock(), then calls 134 // dentry.checkCachingLocked on all dentries in *dsp with fs.renameMu locked 135 // for writing. 136 // 137 // dsp is a pointer-to-pointer since defer evaluates its arguments immediately, 138 // but dentry slices are allocated lazily, and it's much easier to say "defer 139 // fs.renameMuRUnlockAndCheckCaching(&ds)" than "defer func() { 140 // fs.renameMuRUnlockAndCheckCaching(ds) }()" to work around this. 
141 // +checklocksreleaseread:fs.renameMu 142 func (fs *filesystem) renameMuRUnlockAndCheckCaching(ctx context.Context, dsp **[]*dentry) { 143 fs.renameMu.RUnlock() 144 if *dsp == nil { 145 return 146 } 147 ds := **dsp 148 for _, d := range ds { 149 d.checkCachingLocked(ctx, false /* renameMuWriteLocked */) 150 } 151 putDentrySlice(*dsp) 152 } 153 154 // +checklocksrelease:fs.renameMu 155 func (fs *filesystem) renameMuUnlockAndCheckCaching(ctx context.Context, ds **[]*dentry) { 156 if *ds == nil { 157 fs.renameMu.Unlock() 158 return 159 } 160 for _, d := range **ds { 161 d.checkCachingLocked(ctx, true /* renameMuWriteLocked */) 162 } 163 fs.renameMu.Unlock() 164 putDentrySlice(*ds) 165 } 166 167 // stepLocked resolves rp.Component() to an existing file, starting from the 168 // given directory. 169 // 170 // Dentries which may become cached as a result of the traversal are appended 171 // to *ds. 172 // 173 // Preconditions: 174 // - fs.renameMu must be locked. 175 // - d.opMu must be locked for reading. 176 // - !rp.Done(). 177 // - If !d.cachedMetadataAuthoritative(), then d and all children that are 178 // part of rp must have been revalidated. 179 // 180 // +checklocksread:d.opMu 181 func (fs *filesystem) stepLocked(ctx context.Context, rp resolvingPath, d *dentry, mayFollowSymlinks bool, ds **[]*dentry) (*dentry, bool, error) { 182 if !d.isDir() { 183 return nil, false, linuxerr.ENOTDIR 184 } 185 if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 186 return nil, false, err 187 } 188 name := rp.Component() 189 if name == "." { 190 rp.Advance() 191 return d, false, nil 192 } 193 if name == ".." 
{ 194 if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { 195 return nil, false, err 196 } else if isRoot || d.parent.Load() == nil { 197 rp.Advance() 198 return d, false, nil 199 } 200 if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil { 201 return nil, false, err 202 } 203 rp.Advance() 204 return d.parent.Load(), false, nil 205 } 206 child, err := fs.getChildAndWalkPathLocked(ctx, d, rp, ds) 207 if err != nil { 208 return nil, false, err 209 } 210 if err := rp.CheckMount(ctx, &child.vfsd); err != nil { 211 return nil, false, err 212 } 213 if child.isSymlink() && mayFollowSymlinks && rp.ShouldFollowSymlink() { 214 target, err := child.readlink(ctx, rp.Mount()) 215 if err != nil { 216 return nil, false, err 217 } 218 followedSymlink, err := rp.HandleSymlink(target) 219 return d, followedSymlink, err 220 } 221 rp.Advance() 222 return child, false, nil 223 } 224 225 // getChildLocked returns a dentry representing the child of parent with the 226 // given name. Returns ENOENT if the child doesn't exist. 227 // 228 // Preconditions: 229 // - fs.renameMu must be locked. 230 // - parent.opMu must be locked. 231 // - parent.isDir(). 232 // - name is not "." or "..". 233 // - parent and the dentry at name have been revalidated. 234 // 235 // +checklocks:parent.opMu 236 func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, error) { 237 if child, err := parent.getCachedChildLocked(name); child != nil || err != nil { 238 return child, err 239 } 240 // We don't need to check for race here because parent.opMu is held for 241 // writing. 242 return fs.getRemoteChildLocked(ctx, parent, name, false /* checkForRace */, ds) 243 } 244 245 // getRemoteChildLocked is similar to getChildLocked, with the additional 246 // precondition that the child identified by name does not exist in cache. 
247 // 248 // If checkForRace argument is true, then this method will check to see if the 249 // call has raced with another getRemoteChild call, and will handle the race if 250 // so. 251 // 252 // Preconditions: 253 // - If checkForRace is false, then parent.opMu must be held for writing. 254 // - Otherwise, parent.opMu must be held for reading. 255 // 256 // Postcondition: The returned dentry is already cached appropriately. 257 // 258 // +checklocksread:parent.opMu 259 func (fs *filesystem) getRemoteChildLocked(ctx context.Context, parent *dentry, name string, checkForRace bool, ds **[]*dentry) (*dentry, error) { 260 child, err := parent.getRemoteChild(ctx, name) 261 // Cache the result appropriately in the dentry tree. 262 if err != nil { 263 if linuxerr.Equals(linuxerr.ENOENT, err) { 264 parent.childrenMu.Lock() 265 defer parent.childrenMu.Unlock() 266 parent.cacheNegativeLookupLocked(name) 267 } 268 return nil, err 269 } 270 271 parent.childrenMu.Lock() 272 defer parent.childrenMu.Unlock() 273 274 if checkForRace { 275 // See if we raced with another getRemoteChild call that added 276 // to the cache. 277 if cachedChild, ok := parent.children[name]; ok && cachedChild != nil { 278 // We raced. Destroy our child and return the cached 279 // one. This child has no handles, no data, and has not 280 // been cached, so destruction is quick and painless. 281 child.destroyDisconnected(ctx) 282 283 // All good. Return the cached child. 284 return cachedChild, nil 285 } 286 // No race, continue with the child we got. 287 } 288 parent.cacheNewChildLocked(child, name) 289 appendNewChildDentry(ds, parent, child) 290 return child, nil 291 } 292 293 // getChildAndWalkPathLocked is the same as getChildLocked, except that it 294 // may prefetch the entire path represented by rp. 
295 // 296 // +checklocksread:parent.opMu 297 func (fs *filesystem) getChildAndWalkPathLocked(ctx context.Context, parent *dentry, rp resolvingPath, ds **[]*dentry) (*dentry, error) { 298 if child, err := parent.getCachedChildLocked(rp.Component()); child != nil || err != nil { 299 return child, err 300 } 301 // dentry.getRemoteChildAndWalkPathLocked already handles dentry caching. 302 return parent.getRemoteChildAndWalkPathLocked(ctx, rp, ds) 303 } 304 305 // getCachedChildLocked returns a child dentry if it was cached earlier. If no 306 // cached child dentry exists, (nil, nil) is returned. 307 // 308 // Preconditions: 309 // - fs.renameMu must be locked. 310 // - d.opMu must be locked for reading. 311 // - d.isDir(). 312 // - name is not "." or "..". 313 // - d and the dentry at name have been revalidated. 314 // 315 // +checklocksread:d.opMu 316 func (d *dentry) getCachedChildLocked(name string) (*dentry, error) { 317 if len(name) > MaxFilenameLen { 318 return nil, linuxerr.ENAMETOOLONG 319 } 320 d.childrenMu.Lock() 321 defer d.childrenMu.Unlock() 322 if child, ok := d.children[name]; ok || d.isSynthetic() { 323 if child == nil { 324 return nil, linuxerr.ENOENT 325 } 326 return child, nil 327 } 328 329 if d.childrenSet != nil { 330 // Is the child even there? Don't make RPC if not. 331 if _, ok := d.childrenSet[name]; !ok { 332 return nil, linuxerr.ENOENT 333 } 334 } 335 return nil, nil 336 } 337 338 // walkParentDirLocked resolves all but the last path component of rp to an 339 // existing directory, starting from the given directory (which is usually 340 // rp.Start().Impl().(*dentry)). It does not check that the returned directory 341 // is searchable by the provider of rp. 342 // 343 // Preconditions: 344 // - fs.renameMu must be locked. 345 // - !rp.Done(). 346 // - If !d.cachedMetadataAuthoritative(), then d's cached metadata must be up 347 // to date. 
348 func (fs *filesystem) walkParentDirLocked(ctx context.Context, vfsRP *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) { 349 rp := resolvingPathParent(vfsRP) 350 if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { 351 return nil, err 352 } 353 for !rp.done() { 354 d.opMu.RLock() 355 next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) 356 d.opMu.RUnlock() 357 if err != nil { 358 return nil, err 359 } 360 d = next 361 if followedSymlink { 362 if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { 363 return nil, err 364 } 365 } 366 } 367 if !d.isDir() { 368 return nil, linuxerr.ENOTDIR 369 } 370 return d, nil 371 } 372 373 // resolveLocked resolves rp to an existing file. 374 // 375 // Preconditions: fs.renameMu must be locked. 376 func (fs *filesystem) resolveLocked(ctx context.Context, vfsRP *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) { 377 rp := resolvingPathFull(vfsRP) 378 d := rp.Start().Impl().(*dentry) 379 if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { 380 return nil, err 381 } 382 for !rp.done() { 383 d.opMu.RLock() 384 next, followedSymlink, err := fs.stepLocked(ctx, rp, d, true /* mayFollowSymlinks */, ds) 385 d.opMu.RUnlock() 386 if err != nil { 387 return nil, err 388 } 389 d = next 390 if followedSymlink { 391 if err := fs.revalidatePath(ctx, rp, d, ds); err != nil { 392 return nil, err 393 } 394 } 395 } 396 if rp.MustBeDir() && !d.isDir() { 397 return nil, linuxerr.ENOTDIR 398 } 399 return d, nil 400 } 401 402 // doCreateAt checks that creating a file at rp is permitted, then invokes 403 // createInRemoteDir (if the parent directory is a real remote directory) or 404 // createInSyntheticDir (if the parent directory is synthetic) to do so. 405 // 406 // Preconditions: 407 // - !rp.Done(). 408 // - For the final path component in rp, !rp.ShouldFollowSymlink(). 
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, createInRemoteDir func(parent *dentry, name string, ds **[]*dentry) (*dentry, error), createInSyntheticDir func(parent *dentry, name string) (*dentry, error)) error {
	// ds collects dentries on which checkCachingLocked must be called; the
	// deferred call below processes it after releasing fs.renameMu.
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}

	// Order of checks is important. First check if parent directory can be
	// executed, then check for existence, and lastly check if mount is writable.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return linuxerr.EEXIST
	}
	if parent.isDeleted() {
		return linuxerr.ENOENT
	}
	if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, name, &ds); err != nil {
		return err
	}

	// parent.opMu is held for writing for the rest of the function, including
	// while the create callbacks run.
	parent.opMu.Lock()
	defer parent.opMu.Unlock()

	if len(name) > MaxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	// Check for existence only if caching information is available. Otherwise,
	// don't check for existence just yet. We will check for existence if the
	// checks for writability fail below. Existence check is done by the creation
	// RPCs themselves.
	parent.childrenMu.Lock()
	if child, ok := parent.children[name]; ok && child != nil {
		parent.childrenMu.Unlock()
		return linuxerr.EEXIST
	}
	if parent.childrenSet != nil {
		if _, ok := parent.childrenSet[name]; ok {
			parent.childrenMu.Unlock()
			return linuxerr.EEXIST
		}
	}
	parent.childrenMu.Unlock()
	// checkExistence does the full lookup (possibly remote, via
	// getChildLocked). It returns EEXIST if a child named name exists, nil if
	// the lookup reports ENOENT, and any other lookup error as-is.
	checkExistence := func() error {
		if child, err := fs.getChildLocked(ctx, parent, name, &ds); err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) {
			return err
		} else if child != nil {
			return linuxerr.EEXIST
		}
		return nil
	}

	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		// Existence check takes precedence.
		if existenceErr := checkExistence(); existenceErr != nil {
			return existenceErr
		}
		return err
	}
	defer mnt.EndWrite()

	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
		// Existence check takes precedence.
		if existenceErr := checkExistence(); existenceErr != nil {
			return existenceErr
		}
		return err
	}
	// A path that must name a directory cannot be created as a non-directory.
	if !dir && rp.MustBeDir() {
		return linuxerr.ENOENT
	}
	if parent.isSynthetic() {
		// Synthetic parent: creation is only possible if the caller supplied
		// a synthetic-directory callback.
		if createInSyntheticDir == nil {
			return linuxerr.EPERM
		}
		child, err := createInSyntheticDir(parent, name)
		if err != nil {
			return err
		}
		parent.childrenMu.Lock()
		parent.cacheNewChildLocked(child, name)
		parent.syntheticChildren++
		parent.clearDirentsLocked()
		parent.childrenMu.Unlock()
		parent.touchCMtime()
		ev := linux.IN_CREATE
		if dir {
			ev |= linux.IN_ISDIR
		}
		parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
		return nil
	}
	// No cached dentry exists; however, in InteropModeShared there might still be
	// an existing file at name. Just attempt the file creation RPC anyways. If a
	// file does exist, the RPC will fail with EEXIST like we would have.
	child, err := createInRemoteDir(parent, name, &ds)
	if err != nil {
		return err
	}
	parent.childrenMu.Lock()
	parent.cacheNewChildLocked(child, name)
	if child.isSynthetic() {
		// createInRemoteDir may hand back a synthetic dentry; only the parent
		// is queued for a caching check in that case.
		parent.syntheticChildren++
		ds = appendDentry(ds, parent)
	} else {
		appendNewChildDentry(&ds, parent, child)
	}
	if fs.opts.interop != InteropModeShared {
		// NOTE(review): this runs after cacheNewChildLocked(child, name);
		// whether a nil entry for name can still be present here depends on
		// cacheNewChildLocked, which is not visible in this part of the file.
		if child, ok := parent.children[name]; ok && child == nil {
			// Delete the now-stale negative dentry.
			delete(parent.children, name)
			parent.negativeChildren--
		}
		parent.clearDirentsLocked()
		parent.touchCMtime()
	}
	parent.childrenMu.Unlock()
	ev := linux.IN_CREATE
	if dir {
		ev |= linux.IN_ISDIR
	}
	parent.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */)
	return nil
}

// unlinkAt removes the file named by the final component of rp from its
// parent directory. dir selects rmdir semantics (the target must be an empty
// directory; AT_REMOVEDIR is passed to the unlink call) as opposed to unlink
// semantics (the target must not be a directory).
//
// Preconditions: !rp.Done().
func (fs *filesystem) unlinkAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	// We need to DecRef outside of fs.renameMu because forgetting a dead
	// mountpoint could result in this filesystem being released which acquires
	// fs.renameMu.
	//
	// Deferred calls run in reverse order, so this DecRef loop (registered
	// first) runs after the renameMu unlock registered just below it.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()

	// "." and ".." can never be removed; the error depends on whether this is
	// an rmdir or an unlink.
	name := rp.Component()
	if dir {
		if name == "." {
			return linuxerr.EINVAL
		}
		if name == ".." {
			return linuxerr.ENOTEMPTY
		}
	} else {
		if name == "." || name == ".." {
			return linuxerr.EISDIR
		}
	}

	vfsObj := rp.VirtualFilesystem()
	if err := fs.revalidateOne(ctx, vfsObj, parent, rp.Component(), &ds); err != nil {
		return err
	}

	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)

	parent.opMu.Lock()
	defer parent.opMu.Unlock()

	// If the parent has a children set and name is not in it, fail without
	// making an RPC.
	parent.childrenMu.Lock()
	if parent.childrenSet != nil {
		if _, ok := parent.childrenSet[name]; !ok {
			parent.childrenMu.Unlock()
			return linuxerr.ENOENT
		}
	}
	parent.childrenMu.Unlock()

	// Load child if sticky bit is set because we need to determine whether
	// deletion is allowed. Without the sticky bit only the dentry cache is
	// consulted, so child may stay nil even though the file exists.
	var child *dentry
	if parent.mode.Load()&linux.ModeSticky == 0 {
		var ok bool
		parent.childrenMu.Lock()
		child, ok = parent.children[name]
		parent.childrenMu.Unlock()
		if ok && child == nil {
			// Hit a negative cached entry, child doesn't exist.
			return linuxerr.ENOENT
		}
	} else {
		child, _, err = fs.stepLocked(ctx, resolvingPathFull(rp), parent, false /* mayFollowSymlinks */, &ds)
		if err != nil {
			return err
		}
		if err := parent.mayDelete(rp.Credentials(), child); err != nil {
			return err
		}
	}

	// If a child dentry exists, prepare to delete it. This should fail if it is
	// a mount point. We detect mount points by speculatively calling
	// PrepareDeleteDentry, which fails if child is a mount point.
	//
	// Also note that if child is nil, then it can't be a mount point.
	if child != nil {
		// Hold child.childrenMu so we can check child.children and
		// child.syntheticChildren. We don't access these fields until a bit later,
		// but locking child.childrenMu after calling vfs.PrepareDeleteDentry() would
		// create an inconsistent lock ordering between dentry.childrenMu and
		// vfs.Dentry.mu (in the VFS lock order, it would make dentry.childrenMu both "a
		// FilesystemImpl lock" and "a lock acquired by a FilesystemImpl between
		// PrepareDeleteDentry and CommitDeleteDentry"). To avoid this, lock
		// child.childrenMu before calling PrepareDeleteDentry.
		child.childrenMu.Lock()
		defer child.childrenMu.Unlock()
		if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
			return err
		}
	}
	// From here on, every error return with child != nil must call
	// AbortDeleteDentry to undo the PrepareDeleteDentry above.
	flags := uint32(0)
	// If a dentry exists, use it for best-effort checks on its deletability.
	if dir {
		if child != nil {
			// child must be an empty directory.
			if child.syntheticChildren != 0 { // +checklocksforce: child.childrenMu is held if child != nil.
				// This is definitely not an empty directory, irrespective of
				// fs.opts.interop.
				vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: PrepareDeleteDentry called if child != nil.
				return linuxerr.ENOTEMPTY
			}
			// If InteropModeShared is in effect and the first call to
			// PrepareDeleteDentry above succeeded, then child wasn't
			// revalidated (so we can't expect its file type to be correct) and
			// individually revalidating its children (to confirm that they
			// still exist) would be a waste of time.
			if child.cachedMetadataAuthoritative() {
				if !child.isDir() {
					vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
					return linuxerr.ENOTDIR
				}
				for _, grandchild := range child.children { // +checklocksforce: child.childrenMu is held if child != nil.
					if grandchild != nil {
						vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
						return linuxerr.ENOTEMPTY
					}
				}
			}
		}
		flags = linux.AT_REMOVEDIR
	} else {
		// child must be a non-directory file.
		if child != nil && child.isDir() {
			vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
			return linuxerr.EISDIR
		}
		if rp.MustBeDir() {
			if child != nil {
				vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
			}
			return linuxerr.ENOTDIR
		}
	}
	// parent.unlink is skipped when the parent is synthetic or the child is
	// synthetic. A synthetic parent with no cached child means the file does
	// not exist.
	if parent.isSynthetic() {
		if child == nil {
			return linuxerr.ENOENT
		}
	} else if child == nil || !child.isSynthetic() {
		if err := parent.unlink(ctx, name, flags); err != nil {
			if child != nil {
				vfsObj.AbortDeleteDentry(&child.vfsd) // +checklocksforce: see above.
			}
			return err
		}
	}

	// Generate inotify events for rmdir or unlink.
	if dir {
		parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	} else {
		var cw *vfs.Watches
		if child != nil {
			cw = &child.watches
		}
		vfs.InotifyRemoveChild(ctx, cw, &parent.watches, name)
	}

	parent.childrenMu.Lock()
	defer parent.childrenMu.Unlock()

	if child != nil {
		// The references returned by CommitDeleteDentry are released by the
		// deferred DecRef loop at the top of the function.
		toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) // +checklocksforce: see above.
		child.setDeleted()
		if child.isSynthetic() {
			parent.syntheticChildren--
			child.decRefNoCaching()
		}
		ds = appendDentry(ds, child)
	}
	parent.cacheNegativeLookupLocked(name)
	if parent.cachedMetadataAuthoritative() {
		parent.clearDirentsLocked()
		parent.touchCMtime()
		if dir {
			parent.decLinks()
		}
	}
	return nil
}

// AccessAt implements vfs.FilesystemImpl.AccessAt.
734 func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { 735 var ds *[]*dentry 736 fs.renameMu.RLock() 737 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 738 d, err := fs.resolveLocked(ctx, rp, &ds) 739 if err != nil { 740 return err 741 } 742 if err := d.checkPermissions(creds, ats); err != nil { 743 return err 744 } 745 if ats.MayWrite() && rp.Mount().ReadOnly() { 746 return linuxerr.EROFS 747 } 748 return nil 749 } 750 751 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. 752 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { 753 var ds *[]*dentry 754 fs.renameMu.RLock() 755 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 756 d, err := fs.resolveLocked(ctx, rp, &ds) 757 if err != nil { 758 return nil, err 759 } 760 if opts.CheckSearchable { 761 if !d.isDir() { 762 return nil, linuxerr.ENOTDIR 763 } 764 if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 765 return nil, err 766 } 767 } 768 d.IncRef() 769 // Call d.checkCachingLocked() so it can be removed from the cache if needed. 770 ds = appendDentry(ds, d) 771 return &d.vfsd, nil 772 } 773 774 // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. 775 func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { 776 var ds *[]*dentry 777 fs.renameMu.RLock() 778 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 779 start := rp.Start().Impl().(*dentry) 780 d, err := fs.walkParentDirLocked(ctx, rp, start, &ds) 781 if err != nil { 782 return nil, err 783 } 784 d.IncRef() 785 // Call d.checkCachingLocked() so it can be removed from the cache if needed. 786 ds = appendDentry(ds, d) 787 return &d.vfsd, nil 788 } 789 790 // LinkAt implements vfs.FilesystemImpl.LinkAt. 
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
	// No callback is supplied for synthetic parent directories (last argument
	// is nil), so doCreateAt returns EPERM for them.
	err := fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
		// Hard links cannot cross mounts.
		if rp.Mount() != vd.Mount() {
			return nil, linuxerr.EXDEV
		}
		d := vd.Dentry().Impl().(*dentry)
		if d.isDir() {
			return nil, linuxerr.EPERM
		}
		gid := auth.KGID(d.gid.Load())
		uid := auth.KUID(d.uid.Load())
		mode := linux.FileMode(d.mode.Load())
		if err := vfs.MayLink(rp.Credentials(), mode, uid, gid); err != nil {
			return nil, err
		}
		// A link count of zero is reported as ENOENT; a link count already at
		// the uint32 maximum cannot be incremented.
		if d.nlink.Load() == 0 {
			return nil, linuxerr.ENOENT
		}
		if d.nlink.Load() == math.MaxUint32 {
			return nil, linuxerr.EMLINK
		}
		if d.isSynthetic() {
			// TODO(gvisor.dev/issue/6739): Add synthetic file hard link support.
			return nil, linuxerr.EOPNOTSUPP
		}
		return parent.link(ctx, d, name)
	}, nil)

	if err == nil {
		// Success! Account for the new link on the link target.
		vd.Dentry().Impl().(*dentry).incLinks()
	}
	return err
}

// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	creds := rp.Credentials()
	// First callback: parent is a remote directory. Second callback: parent
	// is a synthetic directory.
	return fs.doCreateAt(ctx, rp, true /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
		// If the parent is a setgid directory, use the parent's GID
		// rather than the caller's and enable setgid.
		kgid := creds.EffectiveKGID
		mode := opts.Mode
		if parent.mode.Load()&linux.S_ISGID != 0 {
			kgid = auth.KGID(parent.gid.Load())
			mode |= linux.S_ISGID
		}

		child, err := parent.mkdir(ctx, name, mode, creds.EffectiveKUID, kgid)
		if err == nil {
			if fs.opts.interop != InteropModeShared {
				parent.incLinks()
			}
			return child, nil
		}

		// The remote mkdir failed. Fall back to a synthetic directory only
		// for synthetic mountpoints, and never when the name already exists.
		if !opts.ForSyntheticMountpoint || linuxerr.Equals(linuxerr.EEXIST, err) {
			return nil, err
		}
		ctx.Infof("Failed to create remote directory %q: %v; falling back to synthetic directory", name, err)
		// NOTE(review): the fallback uses opts.Mode and creds.EffectiveKGID,
		// not the setgid-adjusted mode/kgid computed above - confirm that
		// this is intended.
		child = fs.newSyntheticDentry(&createSyntheticOpts{
			name: name,
			mode: linux.S_IFDIR | opts.Mode,
			kuid: creds.EffectiveKUID,
			kgid: creds.EffectiveKGID,
		})
		if fs.opts.interop != InteropModeShared {
			parent.incLinks()
		}
		return child, nil
	}, func(parent *dentry, name string) (*dentry, error) {
		if !opts.ForSyntheticMountpoint {
			// Can't create non-synthetic files in synthetic directories.
			return nil, linuxerr.EPERM
		}
		child := fs.newSyntheticDentry(&createSyntheticOpts{
			name: name,
			mode: linux.S_IFDIR | opts.Mode,
			kuid: creds.EffectiveKUID,
			kgid: creds.EffectiveKGID,
		})
		parent.incLinks()
		return child, nil
	})
}

// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
	// No callback is supplied for synthetic parent directories (last argument
	// is nil), so doCreateAt returns EPERM for them.
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) {
		creds := rp.Credentials()
		if child, err := parent.mknod(ctx, name, creds, &opts); err == nil {
			return child, nil
		} else if !linuxerr.Equals(linuxerr.EPERM, err) {
			return nil, err
		}

		// EPERM means that gofer does not allow creating a socket or pipe. Fallback
		// to creating a synthetic one, i.e. one that is kept entirely in memory.

		// Check that we're not overriding an existing file with a synthetic one.
		_, _, err := fs.stepLocked(ctx, resolvingPathFull(rp), parent, false /* mayFollowSymlinks */, ds) // +checklocksforce: parent.opMu taken by doCreateAt.
		switch {
		case err == nil:
			// Step succeeded, another file exists.
			return nil, linuxerr.EEXIST
		case !linuxerr.Equals(linuxerr.ENOENT, err):
			// Schrödinger. File/Cat may or may not exist.
			return nil, err
		}

		// Only sockets and named pipes can be created synthetically.
		switch opts.Mode.FileType() {
		case linux.S_IFSOCK:
			return fs.newSyntheticDentry(&createSyntheticOpts{
				name:     name,
				mode:     opts.Mode,
				kuid:     creds.EffectiveKUID,
				kgid:     creds.EffectiveKGID,
				endpoint: opts.Endpoint,
			}), nil
		case linux.S_IFIFO:
			return fs.newSyntheticDentry(&createSyntheticOpts{
				name: name,
				mode: opts.Mode,
				kuid: creds.EffectiveKUID,
				kgid: creds.EffectiveKGID,
				pipe: pipe.NewVFSPipe(true /* isNamed */, pipe.DefaultPipeSize),
			}), nil
		}
		// Retain error from gofer if synthetic file cannot be created internally.
		return nil, linuxerr.EPERM
	}, nil)
}

// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	// Reject O_TMPFILE, which is not supported; supporting it correctly in the
	// presence of other remote filesystem users requires remote filesystem
	// support, and it isn't clear that there's any way to implement this in
	// 9P.
930 if opts.Flags&linux.O_TMPFILE != 0 { 931 return nil, linuxerr.EOPNOTSUPP 932 } 933 mayCreate := opts.Flags&linux.O_CREAT != 0 934 mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL) 935 936 var ds *[]*dentry 937 fs.renameMu.RLock() 938 unlocked := false 939 unlock := func() { 940 if !unlocked { 941 fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 942 unlocked = true 943 } 944 } 945 defer unlock() 946 947 start := rp.Start().Impl().(*dentry) 948 if rp.Done() { 949 // Reject attempts to open mount root directory with O_CREAT. 950 if mayCreate && rp.MustBeDir() { 951 return nil, linuxerr.EISDIR 952 } 953 if mustCreate { 954 return nil, linuxerr.EEXIST 955 } 956 if !start.cachedMetadataAuthoritative() { 957 // Refresh dentry's attributes before opening. 958 if err := start.updateMetadata(ctx); err != nil { 959 return nil, err 960 } 961 } 962 start.IncRef() 963 defer start.DecRef(ctx) 964 unlock() 965 // start is intentionally not added to ds (which would remove it from the 966 // cache) because doing so regresses performance in practice. 967 return start.open(ctx, rp, &opts) 968 } 969 970 afterTrailingSymlink: 971 parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds) 972 if err != nil { 973 return nil, err 974 } 975 // Check for search permission in the parent directory. 976 if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 977 return nil, err 978 } 979 // Reject attempts to open directories with O_CREAT. 980 if mayCreate && rp.MustBeDir() { 981 return nil, linuxerr.EISDIR 982 } 983 if err := fs.revalidateOne(ctx, rp.VirtualFilesystem(), parent, rp.Component(), &ds); err != nil { 984 return nil, err 985 } 986 // Determine whether or not we need to create a file. 987 // NOTE(b/263297063): Don't hold opMu for writing here, to avoid 988 // serializing OpenAt calls in the same directory in the common case 989 // that the file exists. 
990 parent.opMu.RLock() 991 child, followedSymlink, err := fs.stepLocked(ctx, resolvingPathFull(rp), parent, true /* mayFollowSymlinks */, &ds) 992 parent.opMu.RUnlock() 993 if followedSymlink { 994 if mustCreate { 995 // EEXIST must be returned if an existing symlink is opened with O_EXCL. 996 return nil, linuxerr.EEXIST 997 } 998 if err != nil { 999 // If followedSymlink && err != nil, then this symlink resolution error 1000 // must be handled by the VFS layer. 1001 return nil, err 1002 } 1003 start = parent 1004 goto afterTrailingSymlink 1005 } 1006 if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate { 1007 if parent.isSynthetic() { 1008 return nil, linuxerr.EPERM 1009 } 1010 1011 // Take opMu for writing, but note that the file may have been 1012 // created by another goroutine since we checked for existence 1013 // a few lines ago. We must handle that case. 1014 parent.opMu.Lock() 1015 fd, createErr := parent.createAndOpenChildLocked(ctx, rp, &opts, &ds) 1016 if !linuxerr.Equals(linuxerr.EEXIST, createErr) { 1017 // Either the creation was a success, or we got an 1018 // unexpected error. Either way we can return here. 1019 parent.opMu.Unlock() 1020 return fd, createErr 1021 } 1022 1023 // We raced, and now the file exists. 1024 if mustCreate { 1025 parent.opMu.Unlock() 1026 return nil, linuxerr.EEXIST 1027 } 1028 1029 // Step to the file again. Since we still hold opMu for 1030 // writing, there can't be a race here. 1031 child, _, err = fs.stepLocked(ctx, resolvingPathFull(rp), parent, false /* mayFollowSymlinks */, &ds) 1032 parent.opMu.Unlock() 1033 } 1034 if err != nil { 1035 return nil, err 1036 } 1037 if mustCreate { 1038 return nil, linuxerr.EEXIST 1039 } 1040 if rp.MustBeDir() && !child.isDir() { 1041 return nil, linuxerr.ENOTDIR 1042 } 1043 child.IncRef() 1044 defer child.DecRef(ctx) 1045 unlock() 1046 // child is intentionally not added to ds (which would remove it from the 1047 // cache) because doing so regresses performance in practice. 
1048 return child.open(ctx, rp, &opts) 1049 } 1050 1051 // Preconditions: The caller must hold no locks (since opening pipes may block 1052 // indefinitely). 1053 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) { 1054 ats := vfs.AccessTypesForOpenFlags(opts) 1055 if err := d.checkPermissions(rp.Credentials(), ats); err != nil { 1056 return nil, err 1057 } 1058 1059 if !d.isSynthetic() { 1060 // renameMu is locked here because it is required by d.openHandle(), which 1061 // is called by d.ensureSharedHandle() and d.openSpecialFile() below. It is 1062 // also required by d.connect() which is called by 1063 // d.openSocketByConnecting(). Note that opening non-synthetic pipes may 1064 // block, renameMu is unlocked separately in d.openSpecialFile() for pipes. 1065 d.fs.renameMu.RLock() 1066 defer d.fs.renameMu.RUnlock() 1067 } 1068 1069 trunc := opts.Flags&linux.O_TRUNC != 0 && d.fileType() == linux.S_IFREG 1070 if trunc { 1071 // Lock metadataMu *while* we open a regular file with O_TRUNC because 1072 // open(2) will change the file size on server. 1073 d.metadataMu.Lock() 1074 defer d.metadataMu.Unlock() 1075 } 1076 1077 var vfd *vfs.FileDescription 1078 var err error 1079 mnt := rp.Mount() 1080 switch d.fileType() { 1081 case linux.S_IFREG: 1082 if !d.fs.opts.regularFilesUseSpecialFileFD { 1083 if err := d.ensureSharedHandle(ctx, ats.MayRead(), ats.MayWrite(), trunc); err != nil { 1084 return nil, err 1085 } 1086 fd, err := newRegularFileFD(mnt, d, opts.Flags) 1087 if err != nil { 1088 return nil, err 1089 } 1090 vfd = &fd.vfsfd 1091 } 1092 case linux.S_IFDIR: 1093 // Can't open directories with O_CREAT. 1094 if opts.Flags&linux.O_CREAT != 0 { 1095 return nil, linuxerr.EISDIR 1096 } 1097 // Can't open directories writably. 
1098 if ats&vfs.MayWrite != 0 { 1099 return nil, linuxerr.EISDIR 1100 } 1101 if opts.Flags&linux.O_DIRECT != 0 { 1102 return nil, linuxerr.EINVAL 1103 } 1104 if !d.isSynthetic() { 1105 if err := d.ensureSharedHandle(ctx, ats&vfs.MayRead != 0, false /* write */, false /* trunc */); err != nil { 1106 return nil, err 1107 } 1108 } 1109 fd := &directoryFD{} 1110 fd.LockFD.Init(&d.locks) 1111 if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil { 1112 return nil, err 1113 } 1114 if d.readFD.Load() >= 0 { 1115 fsmetric.GoferOpensHost.Increment() 1116 } else { 1117 fsmetric.GoferOpens9P.Increment() 1118 } 1119 return &fd.vfsfd, nil 1120 case linux.S_IFLNK: 1121 // Can't open symlinks without O_PATH, which is handled at the VFS layer. 1122 return nil, linuxerr.ELOOP 1123 case linux.S_IFSOCK: 1124 if d.isSynthetic() { 1125 return nil, linuxerr.ENXIO 1126 } 1127 if d.fs.iopts.OpenSocketsByConnecting { 1128 return d.openSocketByConnecting(ctx, opts) 1129 } 1130 case linux.S_IFIFO: 1131 if d.isSynthetic() { 1132 return d.pipe.Open(ctx, mnt, &d.vfsd, opts.Flags, &d.locks) 1133 } 1134 if d.fs.opts.disableFifoOpen { 1135 return nil, linuxerr.EPERM 1136 } 1137 } 1138 1139 if vfd == nil { 1140 if vfd, err = d.openSpecialFile(ctx, mnt, opts); err != nil { 1141 return nil, err 1142 } 1143 } 1144 1145 if trunc { 1146 // If no errors occurred so far then update file size in memory. This 1147 // step is required even if !d.cachedMetadataAuthoritative() because 1148 // d.mappings has to be updated. 1149 // d.metadataMu has already been acquired if trunc == true. 1150 d.updateSizeLocked(0) 1151 1152 if d.cachedMetadataAuthoritative() { 1153 d.touchCMtimeLocked() 1154 } 1155 } 1156 return vfd, err 1157 } 1158 1159 // Precondition: fs.renameMu is locked. 
func (d *dentry) openSocketByConnecting(ctx context.Context, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
	if opts.Flags&linux.O_DIRECT != 0 {
		return nil, linuxerr.EINVAL
	}
	// Note that special value of linux.SockType = 0 is interpreted by lisafs
	// as "do not care about the socket type". Analogous to p9.AnonymousSocket.
	sockFD, err := d.connect(ctx, 0 /* sockType */)
	if err != nil {
		return nil, err
	}
	// Wrap the connected socket in a host FD on the kernel's host mount.
	fd, err := host.NewFD(ctx, kernel.KernelFromContext(ctx).HostMount(), sockFD, &host.NewFDOptions{
		HaveFlags: true,
		Flags:     opts.Flags,
	})
	if err != nil {
		// The host FD was not created, so close sockFD here.
		unix.Close(sockFD)
		return nil, err
	}
	return fd, nil
}

// openSpecialFile opens a handle for d with the access mode implied by
// opts.Flags and wraps it in a special file FD. Blocking opens of named pipes
// are emulated by retrying or waiting, as described below.
//
// Preconditions:
//   - !d.isSynthetic().
//   - fs.renameMu is locked. It may be released temporarily while pipe blocks.
//   - If d is a pipe, no other locks (other than fs.renameMu) should be held.
func (d *dentry) openSpecialFile(ctx context.Context, mnt *vfs.Mount, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
	ats := vfs.AccessTypesForOpenFlags(opts)
	if opts.Flags&linux.O_DIRECT != 0 && !d.isRegularFile() {
		return nil, linuxerr.EINVAL
	}
	// We assume that the server silently inserts O_NONBLOCK in the open flags
	// for all named pipes (because all existing gofers do this).
	//
	// NOTE(b/133875563): This makes named pipe opens racy, because the
	// mechanisms for translating nonblocking to blocking opens can only detect
	// the instantaneous presence of a peer holding the other end of the pipe
	// open, not whether the pipe was *previously* opened by a peer that has
	// since closed its end.
	isBlockingOpenOfNamedPipe := d.fileType() == linux.S_IFIFO && opts.Flags&linux.O_NONBLOCK == 0
retry:
	h, err := d.openHandle(ctx, ats.MayRead(), ats.MayWrite(), opts.Flags&linux.O_TRUNC != 0)
	if err != nil {
		if isBlockingOpenOfNamedPipe && ats == vfs.MayWrite && linuxerr.Equals(linuxerr.ENXIO, err) {
			// An attempt to open a named pipe with O_WRONLY|O_NONBLOCK fails
			// with ENXIO if opening the same named pipe with O_WRONLY would
			// block because there are no readers of the pipe. Release renameMu
			// while blocking.
			d.fs.renameMu.RUnlock()
			err := sleepBetweenNamedPipeOpenChecks(ctx)
			// Re-acquire renameMu before checking err so that the lock is held
			// again on every return path.
			d.fs.renameMu.RLock()
			if err != nil {
				return nil, err
			}
			goto retry
		}
		return nil, err
	}
	// Read-only blocking open of a named pipe backed by a host FD: wait until
	// the pipe has a writer.
	if isBlockingOpenOfNamedPipe && ats == vfs.MayRead && h.fd >= 0 {
		// Release renameMu while blocking.
		d.fs.renameMu.RUnlock()
		err := blockUntilNonblockingPipeHasWriter(ctx, h.fd)
		d.fs.renameMu.RLock()
		if err != nil {
			h.close(ctx)
			return nil, err
		}
	}
	fd, err := newSpecialFileFD(h, mnt, d, opts.Flags)
	if err != nil {
		// The handle was not handed to a file description, so close it here.
		h.close(ctx)
		return nil, err
	}
	return &fd.vfsfd, nil
}

// createAndOpenChildLocked creates the file named rp.Component() in directory
// d through d.openCreate, inserts the new dentry into d's child cache and
// returns a file description for it. An IN_CREATE event is generated on d's
// watches on success.
//
// Preconditions:
//   - d.fs.renameMu must be locked.
//   - d.opMu must be locked for writing.
//   - !d.isSynthetic().
//
// +checklocks:d.opMu
func (d *dentry) createAndOpenChildLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, ds **[]*dentry) (*vfs.FileDescription, error) {
	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
		return nil, err
	}
	if d.isDeleted() {
		return nil, linuxerr.ENOENT
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return nil, err
	}
	defer mnt.EndWrite()

	creds := rp.Credentials()
	name := rp.Component()
	// If the parent is a setgid directory, use the parent's GID rather
	// than the caller's.
	kgid := creds.EffectiveKGID
	if d.mode.Load()&linux.S_ISGID != 0 {
		kgid = auth.KGID(d.gid.Load())
	}

	child, h, err := d.openCreate(ctx, name, opts.Flags&linux.O_ACCMODE, opts.Mode, creds.EffectiveKUID, kgid)
	if err != nil {
		return nil, err
	}

	// Incorporate the fid that was opened by lcreate.
	useRegularFileFD := child.fileType() == linux.S_IFREG && !d.fs.opts.regularFilesUseSpecialFileFD
	if useRegularFileFD {
		var readable, writable bool
		child.handleMu.Lock()
		if vfs.MayReadFileWithOpenFlags(opts.Flags) {
			readable = true
			// Only record the FD for reading and mmap when h.fd is not -1.
			if h.fd != -1 {
				child.readFD = atomicbitops.FromInt32(h.fd)
				child.mmapFD = atomicbitops.FromInt32(h.fd)
			}
		}
		if vfs.MayWriteFileWithOpenFlags(opts.Flags) {
			writable = true
			child.writeFD = atomicbitops.FromInt32(h.fd)
		}
		child.updateHandles(ctx, h, readable, writable)
		child.handleMu.Unlock()
	}
	// Insert the dentry into the tree.
	d.childrenMu.Lock()
	// We have d.opMu for writing, so there can not be a cached child with
	// this name. We could not have raced.
	d.cacheNewChildLocked(child, name)
	appendNewChildDentry(ds, d, child)
	if d.cachedMetadataAuthoritative() {
		d.touchCMtime()
		d.clearDirentsLocked()
	}
	d.childrenMu.Unlock()

	// Finally, construct a file description representing the created file.
	var childVFSFD *vfs.FileDescription
	if useRegularFileFD {
		fd, err := newRegularFileFD(mnt, child, opts.Flags)
		if err != nil {
			return nil, err
		}
		childVFSFD = &fd.vfsfd
	} else {
		fd, err := newSpecialFileFD(h, mnt, child, opts.Flags)
		if err != nil {
			// h was not handed to child above in this branch, so close it here.
			h.close(ctx)
			return nil, err
		}
		childVFSFD = &fd.vfsfd
	}
	d.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
	return childVFSFD, nil
}

// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
//
// It returns EINVAL if the resolved dentry is not a symlink.
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		return "", err
	}
	if !d.isSymlink() {
		return "", linuxerr.EINVAL
	}
	return d.readlink(ctx, rp.Mount())
}

// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// The child oldName of oldParentVD is moved to the location named by rp.
// fs.renameMu is held for writing for the whole operation. RENAME_NOREPLACE
// is the only flag accepted, and it is rejected under InteropModeShared.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// Resolve newParent first to verify that it's on this Mount.
	var ds *[]*dentry
	fs.renameMu.Lock()
	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
	// could result in this filesystem being released which acquires fs.mu.
	// This defer is registered before the unlock defer below, so it runs
	// after renameMu has been released.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	defer fs.renameMuUnlockAndCheckCaching(ctx, &ds)
	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
	if err != nil {
		return err
	}

	// Any flag other than RENAME_NOREPLACE is unsupported.
	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		return linuxerr.EINVAL
	}
	if fs.opts.interop == InteropModeShared && opts.Flags&linux.RENAME_NOREPLACE != 0 {
		// Requires 9P support to synchronize with other remote filesystem
		// users.
		return linuxerr.EINVAL
	}

	newName := rp.Component()
	if newName == "." || newName == ".." {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if len(newName) > MaxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	oldParent := oldParentVD.Dentry().Impl().(*dentry)
	if !oldParent.cachedMetadataAuthoritative() {
		if err := oldParent.updateMetadata(ctx); err != nil {
			return err
		}
	}
	creds := rp.Credentials()
	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}

	vfsObj := rp.VirtualFilesystem()
	if err := fs.revalidateOne(ctx, vfsObj, newParent, newName, &ds); err != nil {
		return err
	}
	if err := fs.revalidateOne(ctx, vfsObj, oldParent, oldName, &ds); err != nil {
		return err
	}

	// We need a dentry representing the renamed file since, if it's a
	// directory, we need to check for write permission on it.
	oldParent.opMu.Lock()
	defer oldParent.opMu.Unlock()
	renamed, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
	if err != nil {
		return err
	}
	if err := oldParent.mayDelete(creds, renamed); err != nil {
		return err
	}
	if renamed.isDir() {
		// A directory cannot be moved into itself or one of its descendants.
		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
			return linuxerr.EINVAL
		}
		if oldParent != newParent {
			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return linuxerr.ENOTDIR
		}
	}

	if oldParent != newParent {
		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
			return err
		}
		newParent.opMu.Lock()
		defer newParent.opMu.Unlock()
	}
	if newParent.isDeleted() {
		return linuxerr.ENOENT
	}
	replaced, err := fs.getChildLocked(ctx, newParent, newName, &ds) // +checklocksforce: newParent.opMu taken if newParent != oldParent.
	if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) {
		return err
	}
	// ENOENT is not an error here: it means that nothing is being replaced.
	var replacedVFSD *vfs.Dentry
	if replaced != nil {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		replacedVFSD = &replaced.vfsd
		if replaced.isDir() {
			if !renamed.isDir() {
				return linuxerr.EISDIR
			}
			if genericIsAncestorDentry(replaced, renamed) {
				return linuxerr.ENOTEMPTY
			}
		} else {
			if rp.MustBeDir() || renamed.isDir() {
				return linuxerr.ENOTDIR
			}
		}
	}

	// Renaming a file onto itself is a no-op.
	if oldParent == newParent && oldName == newName {
		return nil
	}
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}

	// Update the remote filesystem.
	if !renamed.isSynthetic() {
		if err := oldParent.rename(ctx, oldName, newParent, newName); err != nil {
			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
			return err
		}
	} else if replaced != nil && !replaced.isSynthetic() {
		// We are replacing an existing real file with a synthetic one, so we
		// need to unlink the former.
		flags := uint32(0)
		if replaced.isDir() {
			flags = linux.AT_REMOVEDIR
		}
		if err := newParent.unlink(ctx, newName, flags); err != nil {
			vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
			return err
		}
	}

	// Update the dentry tree.
	newParent.childrenMu.Lock()
	defer newParent.childrenMu.Unlock()
	if oldParent != newParent {
		oldParent.childrenMu.Lock()
		defer oldParent.childrenMu.Unlock()
	}

	toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
	if replaced != nil {
		replaced.setDeleted()
		if replaced.isSynthetic() {
			newParent.syntheticChildren--
			replaced.decRefNoCaching()
		}
		ds = appendDentry(ds, replaced)
		// Remove the replaced entry from its parent's cache.
		delete(newParent.children, newName)
	}
	oldParent.cacheNegativeLookupLocked(oldName) // +checklocksforce: oldParent.childrenMu is held if oldParent != newParent.
	if renamed.isSynthetic() {
		oldParent.syntheticChildren--
		newParent.syntheticChildren++
	}
	// We have d.opMu for writing, so no need to check for existence of a
	// child with the given name. We could not have raced.
	newParent.cacheNewChildLocked(renamed, newName)
	oldParent.decRefNoCaching()
	if oldParent != newParent {
		ds = appendDentry(ds, newParent)
		ds = appendDentry(ds, oldParent)
	}

	// Update metadata.
	if renamed.cachedMetadataAuthoritative() {
		renamed.touchCtime()
	}
	if oldParent.cachedMetadataAuthoritative() {
		oldParent.clearDirentsLocked()
		oldParent.touchCMtime()
		if renamed.isDir() {
			oldParent.decLinks()
		}
	}
	if newParent.cachedMetadataAuthoritative() {
		newParent.clearDirentsLocked()
		newParent.touchCMtime()
		if renamed.isDir() && (replaced == nil || !replaced.isDir()) {
			// Increase the link count if we did not replace another directory.
			newParent.incLinks()
		}
	}
	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
	return nil
}

// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// It delegates to fs.unlinkAt with dir set to true.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	return fs.unlinkAt(ctx, rp, true /* dir */)
}

// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
//
// fs.renameMu is released explicitly on both paths rather than via defer, so
// that the inotify event below is generated without renameMu held.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	d, err := fs.resolveLocked(ctx, rp, &ds)
	if err != nil {
		fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
		return err
	}
	err = d.setStat(ctx, rp.Credentials(), &opts, rp.Mount())
	fs.renameMuRUnlockAndCheckCaching(ctx, &ds)
	if err != nil {
		return err
	}

	// Only notify when the changed stat mask maps to an inotify event.
	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
	}
	return nil
}

// StatAt implements vfs.FilesystemImpl.StatAt.
1571 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { 1572 var ds *[]*dentry 1573 fs.renameMu.RLock() 1574 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1575 d, err := fs.resolveLocked(ctx, rp, &ds) 1576 if err != nil { 1577 return linux.Statx{}, err 1578 } 1579 // Since walking updates metadata for all traversed dentries under 1580 // InteropModeShared, including the returned one, we can return cached 1581 // metadata here regardless of fs.opts.interop. 1582 var stat linux.Statx 1583 d.statTo(&stat) 1584 return stat, nil 1585 } 1586 1587 // StatFSAt implements vfs.FilesystemImpl.StatFSAt. 1588 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { 1589 var ds *[]*dentry 1590 fs.renameMu.RLock() 1591 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1592 d, err := fs.resolveLocked(ctx, rp, &ds) 1593 if err != nil { 1594 return linux.Statfs{}, err 1595 } 1596 // If d is synthetic, invoke statfs on the first ancestor of d that isn't. 1597 for d.isSynthetic() { 1598 d = d.parent.Load() 1599 } 1600 statfs, err := d.statfs(ctx) 1601 if err != nil { 1602 return linux.Statfs{}, err 1603 } 1604 if statfs.NameLength == 0 || statfs.NameLength > MaxFilenameLen { 1605 statfs.NameLength = MaxFilenameLen 1606 } 1607 // This is primarily for distinguishing a gofer file system in 1608 // tests. Testing is important, so instead of defining 1609 // something completely random, use a standard value. 1610 statfs.Type = linux.V9FS_MAGIC 1611 return statfs, nil 1612 } 1613 1614 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 
1615 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { 1616 return fs.doCreateAt(ctx, rp, false /* dir */, func(parent *dentry, name string, ds **[]*dentry) (*dentry, error) { 1617 child, err := parent.symlink(ctx, name, target, rp.Credentials()) 1618 if err != nil { 1619 return nil, err 1620 } 1621 if parent.fs.opts.interop != InteropModeShared { 1622 // Cache the symlink target on creation. In practice, this helps avoid a 1623 // lot of ReadLink RPCs. Note that when InteropModeShared is in effect, 1624 // we are forced to make Readlink RPCs. Because in this mode, we use host 1625 // timestamps, not timestamps based on our internal clock. And readlink 1626 // updates the atime on the host. 1627 child.haveTarget = true 1628 child.target = target 1629 } 1630 return child, nil 1631 }, nil) 1632 } 1633 1634 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 1635 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { 1636 return fs.unlinkAt(ctx, rp, false /* dir */) 1637 } 1638 1639 // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. 1640 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { 1641 var ds *[]*dentry 1642 fs.renameMu.RLock() 1643 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1644 d, err := fs.resolveLocked(ctx, rp, &ds) 1645 if err != nil { 1646 return nil, err 1647 } 1648 if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 1649 return nil, err 1650 } 1651 if !d.isSocket() { 1652 return nil, linuxerr.ECONNREFUSED 1653 } 1654 if d.endpoint != nil { 1655 return d.endpoint, nil 1656 } 1657 if !d.isSynthetic() { 1658 d.IncRef() 1659 ds = appendDentry(ds, d) 1660 return &endpoint{ 1661 dentry: d, 1662 path: opts.Addr, 1663 }, nil 1664 } 1665 return nil, linuxerr.ECONNREFUSED 1666 } 1667 1668 // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. 
1669 func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { 1670 var ds *[]*dentry 1671 fs.renameMu.RLock() 1672 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1673 d, err := fs.resolveLocked(ctx, rp, &ds) 1674 if err != nil { 1675 return nil, err 1676 } 1677 return d.listXattr(ctx, size) 1678 } 1679 1680 // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. 1681 func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { 1682 var ds *[]*dentry 1683 fs.renameMu.RLock() 1684 defer fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1685 d, err := fs.resolveLocked(ctx, rp, &ds) 1686 if err != nil { 1687 return "", err 1688 } 1689 return d.getXattr(ctx, rp.Credentials(), &opts) 1690 } 1691 1692 // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 1693 func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { 1694 var ds *[]*dentry 1695 fs.renameMu.RLock() 1696 d, err := fs.resolveLocked(ctx, rp, &ds) 1697 if err != nil { 1698 fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1699 return err 1700 } 1701 err = d.setXattr(ctx, rp.Credentials(), &opts) 1702 fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1703 if err != nil { 1704 return err 1705 } 1706 1707 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 1708 return nil 1709 } 1710 1711 // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. 
1712 func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { 1713 var ds *[]*dentry 1714 fs.renameMu.RLock() 1715 d, err := fs.resolveLocked(ctx, rp, &ds) 1716 if err != nil { 1717 fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1718 return err 1719 } 1720 err = d.removeXattr(ctx, rp.Credentials(), name) 1721 fs.renameMuRUnlockAndCheckCaching(ctx, &ds) 1722 if err != nil { 1723 return err 1724 } 1725 1726 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 1727 return nil 1728 } 1729 1730 // PrependPath implements vfs.FilesystemImpl.PrependPath. 1731 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 1732 fs.renameMu.RLock() 1733 defer fs.renameMu.RUnlock() 1734 return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b) 1735 } 1736 1737 type mopt struct { 1738 key string 1739 value any 1740 } 1741 1742 func (m mopt) String() string { 1743 if m.value == nil { 1744 return fmt.Sprintf("%s", m.key) 1745 } 1746 return fmt.Sprintf("%s=%v", m.key, m.value) 1747 } 1748 1749 // MountOptions implements vfs.FilesystemImpl.MountOptions. 1750 func (fs *filesystem) MountOptions() string { 1751 optsKV := []mopt{ 1752 {moptTransport, transportModeFD}, // Only valid value, currently. 1753 {moptReadFD, fs.opts.fd}, // Currently, read and write FD are the same. 1754 {moptWriteFD, fs.opts.fd}, // Currently, read and write FD are the same. 
1755 {moptAname, fs.opts.aname}, 1756 {moptDfltUID, fs.opts.dfltuid}, 1757 {moptDfltGID, fs.opts.dfltgid}, 1758 } 1759 1760 switch fs.opts.interop { 1761 case InteropModeExclusive: 1762 optsKV = append(optsKV, mopt{moptCache, cacheFSCache}) 1763 case InteropModeWritethrough: 1764 optsKV = append(optsKV, mopt{moptCache, cacheFSCacheWritethrough}) 1765 case InteropModeShared: 1766 optsKV = append(optsKV, mopt{moptCache, cacheRemoteRevalidating}) 1767 } 1768 if fs.opts.regularFilesUseSpecialFileFD { 1769 optsKV = append(optsKV, mopt{moptDisableFileHandleSharing, nil}) 1770 } 1771 if fs.opts.disableFifoOpen { 1772 optsKV = append(optsKV, mopt{moptDisableFifoOpen, nil}) 1773 } 1774 if fs.opts.forcePageCache { 1775 optsKV = append(optsKV, mopt{moptForcePageCache, nil}) 1776 } 1777 if fs.opts.limitHostFDTranslation { 1778 optsKV = append(optsKV, mopt{moptLimitHostFDTranslation, nil}) 1779 } 1780 if fs.opts.overlayfsStaleRead { 1781 optsKV = append(optsKV, mopt{moptOverlayfsStaleRead, nil}) 1782 } 1783 if fs.opts.directfs.enabled { 1784 optsKV = append(optsKV, mopt{moptDirectfs, nil}) 1785 } 1786 1787 opts := make([]string, 0, len(optsKV)) 1788 for _, opt := range optsKV { 1789 opts = append(opts, opt.String()) 1790 } 1791 return strings.Join(opts, ",") 1792 } 1793 1794 // IsDescendant implements vfs.FilesystemImpl.IsDescendant. 1795 func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { 1796 return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) 1797 }