// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/errors/linuxerr"
	"gvisor.dev/gvisor/pkg/fspath"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/fsmetric"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
)

const (
	// direntSize is the size of each directory entry
	// that Linux uses for computing directory size.
	// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
	direntSize = 20
	// shortSymlinkLen mirrors Linux's SHORT_SYMLINK_LEN (128). Only symlinks
	// whose target length is >= shortSymlinkLen are charged against the page
	// accounting.
	shortSymlinkLen = 128
)

// Sync implements vfs.FilesystemImpl.Sync.
//
// It always returns nil.
func (fs *filesystem) Sync(ctx context.Context) error {
	// All filesystem state is in-memory.
	return nil
}

// stepLocked resolves rp.Component() to an existing file, starting from the
// given directory.
//
// When the component names a symlink that should be followed, stepLocked
// returns d itself together with the result of rp.HandleSymlink(); otherwise
// the returned bool is false.
//
// stepLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions:
//   - filesystem.mu must be locked.
//   - !rp.Done().
55 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, error) { 56 dir, ok := d.inode.impl.(*directory) 57 if !ok { 58 return nil, false, linuxerr.ENOTDIR 59 } 60 if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 61 return nil, false, err 62 } 63 name := rp.Component() 64 if name == "." { 65 rp.Advance() 66 return d, false, nil 67 } 68 if name == ".." { 69 if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { 70 return nil, false, err 71 } else if isRoot || d.parent.Load() == nil { 72 rp.Advance() 73 return d, false, nil 74 } 75 if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil { 76 return nil, false, err 77 } 78 rp.Advance() 79 return d.parent.Load(), false, nil 80 } 81 if len(name) > d.inode.fs.maxFilenameLen { 82 return nil, false, linuxerr.ENAMETOOLONG 83 } 84 child, ok := dir.childMap[name] 85 if !ok { 86 return nil, false, linuxerr.ENOENT 87 } 88 if err := rp.CheckMount(ctx, &child.vfsd); err != nil { 89 return nil, false, err 90 } 91 if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { 92 // Symlink traversal updates access time. 93 child.inode.touchAtime(rp.Mount()) 94 followedSymlink, err := rp.HandleSymlink(symlink.target) 95 return d, followedSymlink, err 96 } 97 rp.Advance() 98 return child, false, nil 99 } 100 101 // walkParentDirLocked resolves all but the last path component of rp to an 102 // existing directory, starting from the given directory (which is usually 103 // rp.Start().Impl().(*dentry)). It does not check that the returned directory 104 // is searchable by the provider of rp. 105 // 106 // walkParentDirLocked is loosely analogous to Linux's 107 // fs/namei.c:path_parentat(). 108 // 109 // Preconditions: 110 // - filesystem.mu must be locked. 111 // - !rp.Done(). 
112 func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { 113 for !rp.Final() { 114 next, _, err := stepLocked(ctx, rp, d) 115 if err != nil { 116 return nil, err 117 } 118 d = next 119 } 120 dir, ok := d.inode.impl.(*directory) 121 if !ok { 122 return nil, linuxerr.ENOTDIR 123 } 124 return dir, nil 125 } 126 127 // resolveLocked resolves rp to an existing file. 128 // 129 // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). 130 // 131 // Preconditions: filesystem.mu must be locked. 132 func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { 133 d := rp.Start().Impl().(*dentry) 134 135 if symlink, ok := d.inode.impl.(*symlink); rp.Done() && ok && rp.ShouldFollowSymlink() { 136 // Path with a single component. We don't need to step to the next 137 // component, but still need to resolve any symlinks. 138 // 139 // Symlink traversal updates access time. 140 d.inode.touchAtime(rp.Mount()) 141 if _, err := rp.HandleSymlink(symlink.target); err != nil { 142 return nil, err 143 } 144 } else { 145 // Path with multiple components, walk and resolve as required. 146 for !rp.Done() { 147 next, _, err := stepLocked(ctx, rp, d) 148 if err != nil { 149 return nil, err 150 } 151 d = next 152 } 153 } 154 155 if rp.MustBeDir() && !d.inode.isDir() { 156 return nil, linuxerr.ENOTDIR 157 } 158 return d, nil 159 } 160 161 // doCreateAt checks that creating a file at rp is permitted, then invokes 162 // create to do so. 163 // 164 // doCreateAt is loosely analogous to a conjunction of Linux's 165 // fs/namei.c:filename_create() and done_path_create(). 166 // 167 // Preconditions: 168 // - !rp.Done(). 169 // - For the final path component in rp, !rp.ShouldFollowSymlink(). 
170 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { 171 fs.mu.Lock() 172 defer fs.mu.Unlock() 173 parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 174 if err != nil { 175 return err 176 } 177 178 // Order of checks is important. First check if parent directory can be 179 // executed, then check for existence, and lastly check if mount is writable. 180 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 181 return err 182 } 183 name := rp.Component() 184 if name == "." || name == ".." { 185 return linuxerr.EEXIST 186 } 187 if len(name) > fs.maxFilenameLen { 188 return linuxerr.ENAMETOOLONG 189 } 190 if _, ok := parentDir.childMap[name]; ok { 191 return linuxerr.EEXIST 192 } 193 if !dir && rp.MustBeDir() { 194 return linuxerr.ENOENT 195 } 196 // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only 197 // be dead if it was deleted. 198 if parentDir.dentry.vfsd.IsDead() { 199 return linuxerr.ENOENT 200 } 201 mnt := rp.Mount() 202 if err := mnt.CheckBeginWrite(); err != nil { 203 return err 204 } 205 defer mnt.EndWrite() 206 207 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 208 return err 209 } 210 if err := create(parentDir, name); err != nil { 211 return err 212 } 213 214 ev := linux.IN_CREATE 215 if dir { 216 ev |= linux.IN_ISDIR 217 } 218 parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) 219 parentDir.inode.touchCMtime() 220 return nil 221 } 222 223 // AccessAt implements vfs.Filesystem.Impl.AccessAt. 
224 func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { 225 fs.mu.RLock() 226 defer fs.mu.RUnlock() 227 d, err := resolveLocked(ctx, rp) 228 if err != nil { 229 return err 230 } 231 if err := d.inode.checkPermissions(creds, ats); err != nil { 232 return err 233 } 234 if ats.MayWrite() && rp.Mount().ReadOnly() { 235 return linuxerr.EROFS 236 } 237 return nil 238 } 239 240 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. 241 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { 242 fs.mu.RLock() 243 defer fs.mu.RUnlock() 244 d, err := resolveLocked(ctx, rp) 245 if err != nil { 246 return nil, err 247 } 248 if opts.CheckSearchable { 249 if !d.inode.isDir() { 250 return nil, linuxerr.ENOTDIR 251 } 252 if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 253 return nil, err 254 } 255 } 256 d.IncRef() 257 return &d.vfsd, nil 258 } 259 260 // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. 261 func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { 262 fs.mu.RLock() 263 defer fs.mu.RUnlock() 264 dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 265 if err != nil { 266 return nil, err 267 } 268 dir.dentry.IncRef() 269 return &dir.dentry.vfsd, nil 270 } 271 272 // LinkAt implements vfs.FilesystemImpl.LinkAt. 
273 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { 274 return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { 275 if rp.Mount() != vd.Mount() { 276 return linuxerr.EXDEV 277 } 278 d := vd.Dentry().Impl().(*dentry) 279 i := d.inode 280 if i.isDir() { 281 return linuxerr.EPERM 282 } 283 if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil { 284 return err 285 } 286 if i.nlink.Load() == 0 { 287 return linuxerr.ENOENT 288 } 289 if i.nlink.Load() == maxLinks { 290 return linuxerr.EMLINK 291 } 292 i.incLinksLocked() 293 i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) 294 parentDir.insertChildLocked(fs.newDentry(i), name) 295 return nil 296 }) 297 } 298 299 // MkdirAt implements vfs.FilesystemImpl.MkdirAt. 300 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { 301 return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { 302 creds := rp.Credentials() 303 if parentDir.inode.nlink.Load() == maxLinks { 304 return linuxerr.EMLINK 305 } 306 parentDir.inode.incLinksLocked() // from child's ".." 307 childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) 308 parentDir.insertChildLocked(&childDir.dentry, name) 309 return nil 310 }) 311 } 312 313 // MknodAt implements vfs.FilesystemImpl.MknodAt. 
func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
	return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error {
		creds := rp.Credentials()
		// Build the inode matching the requested file type; any other type is
		// rejected with EINVAL.
		var childInode *inode
		switch opts.Mode.FileType() {
		case linux.S_IFREG:
			childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
		case linux.S_IFIFO:
			childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)
		case linux.S_IFBLK:
			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir)
		case linux.S_IFCHR:
			childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir)
		case linux.S_IFSOCK:
			childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir)
		default:
			return linuxerr.EINVAL
		}
		child := fs.newDentry(childInode)
		parentDir.insertChildLocked(child, name)
		return nil
	})
}

// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// Without O_CREAT the path is resolved under the read lock. With O_CREAT the
// write lock is taken, and a regular file is created when the final component
// does not exist. In every case a reference is taken on the target dentry and
// fs.mu is released before dentry.open is called.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	if opts.Flags&linux.O_TMPFILE != 0 {
		// Not yet supported.
		return nil, linuxerr.EOPNOTSUPP
	}

	// Handle O_CREAT and !O_CREAT separately, since in the latter case we
	// don't need fs.mu for writing.
	if opts.Flags&linux.O_CREAT == 0 {
		fs.mu.RLock()
		d, err := resolveLocked(ctx, rp)
		if err != nil {
			fs.mu.RUnlock()
			return nil, err
		}
		// Hold a reference across the open, which runs after the lock is
		// dropped.
		d.IncRef()
		defer d.DecRef(ctx)
		fs.mu.RUnlock()
		return d.open(ctx, rp, &opts, false /* afterCreate */)
	}

	mustCreate := opts.Flags&linux.O_EXCL != 0
	start := rp.Start().Impl().(*dentry)
	fs.mu.Lock()
	// unlock releases fs.mu at most once. It is deferred to cover the error
	// returns and is also called explicitly before each dentry.open call.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.mu.Unlock()
			unlocked = true
		}
	}
	defer unlock()
	if rp.Done() {
		// Reject attempts to open mount root directory with O_CREAT.
		if rp.MustBeDir() {
			return nil, linuxerr.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		start.IncRef()
		defer start.DecRef(ctx)
		unlock()
		return start.open(ctx, rp, &opts, false /* afterCreate */)
	}
afterTrailingSymlink:
	parentDir, err := walkParentDirLocked(ctx, rp, start)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if rp.MustBeDir() {
		return nil, linuxerr.EISDIR
	}
	// Capture the final component before stepLocked consumes it; it is the
	// name used if the file has to be created.
	name := rp.Component()
	child, followedSymlink, err := stepLocked(ctx, rp, &parentDir.dentry)
	if followedSymlink {
		if mustCreate {
			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
			return nil, linuxerr.EEXIST
		}
		if err != nil {
			// If followedSymlink && err != nil, then this symlink resolution error
			// must be handled by the VFS layer.
			return nil, err
		}
		// Restart the walk from the symlink's parent directory.
		start = &parentDir.dentry
		goto afterTrailingSymlink
	}
	if linuxerr.Equals(linuxerr.ENOENT, err) {
		// Already checked for searchability above; now check for writability.
		if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
			return nil, err
		}
		if err := rp.Mount().CheckBeginWrite(); err != nil {
			return nil, err
		}
		defer rp.Mount().EndWrite()
		// Create and open the child.
		creds := rp.Credentials()
		child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir))
		parentDir.insertChildLocked(child, name)
		child.IncRef()
		defer child.DecRef(ctx)
		unlock()
		fd, err := child.open(ctx, rp, &opts, true)
		if err != nil {
			return nil, err
		}
		// The creation event and timestamp update happen only after the open
		// succeeded, and after fs.mu has been released.
		parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */)
		parentDir.inode.touchCMtime()
		return fd, nil
	}
	if err != nil {
		return nil, err
	}
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	if rp.MustBeDir() && !child.inode.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	return child.open(ctx, rp, &opts, false)
}

// Preconditions: The caller must hold no locks (since opening pipes may block
// indefinitely).
// open creates a file description for d according to the concrete type of
// its inode. When afterCreate is true, the permission check and O_TRUNC
// handling are skipped.
func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) {
	ats := vfs.AccessTypesForOpenFlags(opts)
	if !afterCreate {
		if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
			return nil, err
		}
	}
	switch impl := d.inode.impl.(type) {
	case *regularFile:
		var fd regularFileFD
		fd.LockFD.Init(&d.inode.locks)
		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
			return nil, err
		}
		if !afterCreate && opts.Flags&linux.O_TRUNC != 0 {
			// NOTE(review): if truncate fails, the already-initialized fd is
			// dropped without an explicit DecRef — confirm whether
			// vfsfd.Init acquired references that need releasing here.
			if _, err := impl.truncate(0); err != nil {
				return nil, err
			}
		}
		// Count the open as writable or read-only for metrics.
		if fd.vfsfd.IsWritable() {
			fsmetric.TmpfsOpensW.Increment()
		} else if fd.vfsfd.IsReadable() {
			fsmetric.TmpfsOpensRO.Increment()
		}
		return &fd.vfsfd, nil
	case *directory:
		// Can't open directories with O_CREAT.
		if opts.Flags&linux.O_CREAT != 0 {
			return nil, linuxerr.EISDIR
		}
		// Can't open directories writably.
		if ats&vfs.MayWrite != 0 {
			return nil, linuxerr.EISDIR
		}
		// O_DIRECT on a directory is rejected.
		if opts.Flags&linux.O_DIRECT != 0 {
			return nil, linuxerr.EINVAL
		}
		var fd directoryFD
		fd.LockFD.Init(&d.inode.locks)
		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
			return nil, err
		}
		return &fd.vfsfd, nil
	case *symlink:
		// Can't open symlinks without O_PATH, which is handled at the VFS layer.
		return nil, linuxerr.ELOOP
	case *namedPipe:
		return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks)
	case *deviceFile:
		return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts)
	case *socketFile:
		// Socket files cannot be opened.
		return nil, linuxerr.ENXIO
	default:
		panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl))
	}
}

// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
//
// It returns EINVAL if the resolved file is not a symlink, and otherwise
// touches the symlink's access time and returns its target.
func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
	fs.mu.RLock()
	defer fs.mu.RUnlock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		return "", err
	}
	symlink, ok := d.inode.impl.(*symlink)
	if !ok {
		return "", linuxerr.EINVAL
	}
	symlink.inode.touchAtime(rp.Mount())
	return symlink.target, nil
}

// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// The entry oldName in oldParentVD's directory is moved to the name given by
// the final component of rp. RENAME_NOREPLACE is the only supported flag; any
// other flag yields EINVAL.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// Resolve newParentDir first to verify that it's on this Mount.
	fs.mu.Lock()
	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
	// could result in this filesystem being released which acquires fs.mu.
	// Deferred calls run in reverse order, so the Unlock registered below runs
	// before this loop.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	defer fs.mu.Unlock()
	newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}

	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		// TODO(b/145974740): Support other renameat2 flags.
		return linuxerr.EINVAL
	}

	newName := rp.Component()
	if newName == "." || newName == ".." {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if len(newName) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	// Source and destination must be on the same mount.
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory)
	if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	renamed, ok := oldParentDir.childMap[oldName]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil {
		return err
	}
	// Note that we don't need to call rp.CheckMount(), since if renamed is a
	// mount point then we want to rename the mount point, not anything in the
	// mounted filesystem.
	if renamed.inode.isDir() {
		// A directory cannot be moved into itself or one of its descendants.
		if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) {
			return linuxerr.EINVAL
		}
		if oldParentDir != newParentDir {
			// Writability is needed to change renamed's "..".
			if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return linuxerr.ENOTDIR
		}
	}

	if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	// replaced is nil when nothing currently exists at the destination name.
	replaced, ok := newParentDir.childMap[newName]
	if ok {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		replacedDir, ok := replaced.inode.impl.(*directory)
		if ok {
			// A directory may only be replaced by a directory, and only when
			// it is empty.
			if !renamed.inode.isDir() {
				return linuxerr.EISDIR
			}
			if len(replacedDir.childMap) != 0 {
				return linuxerr.ENOTEMPTY
			}
		} else {
			if rp.MustBeDir() {
				return linuxerr.ENOTDIR
			}
			if renamed.inode.isDir() {
				return linuxerr.ENOTDIR
			}
		}
	} else {
		// Moving a directory to a new name adds a link to the new parent, so
		// the new parent must have room for it.
		if renamed.inode.isDir() && newParentDir.inode.nlink.Load() == maxLinks {
			return linuxerr.EMLINK
		}
	}
	// tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can
	// only be dead if it was deleted.
	if newParentDir.dentry.vfsd.IsDead() {
		return linuxerr.ENOENT
	}

	// Linux places this check before some of those above; we do it here for
	// simplicity, under the assumption that applications are not intentionally
	// doing noop renames expecting them to succeed where non-noop renames
	// would fail.
	if renamed == replaced {
		return nil
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	var replacedVFSD *vfs.Dentry
	if replaced != nil {
		replacedVFSD = &replaced.vfsd
	}
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}
	if replaced != nil {
		newParentDir.removeChildLocked(replaced)
		if replaced.inode.isDir() {
			// Remove links for replaced/. and replaced/..
			replaced.inode.decLinksLocked(ctx)
			newParentDir.inode.decLinksLocked(ctx)
		}
		replaced.inode.decLinksLocked(ctx)
	}
	oldParentDir.removeChildLocked(renamed)
	newParentDir.insertChildLocked(renamed, newName)
	toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
	oldParentDir.inode.touchCMtime()
	if oldParentDir != newParentDir {
		if renamed.inode.isDir() {
			// The moved directory's ".." link transfers from the old parent
			// to the new one.
			oldParentDir.inode.decLinksLocked(ctx)
			newParentDir.inode.incLinksLocked()
		}
		newParentDir.inode.touchCMtime()
	}
	renamed.inode.touchCtime()

	vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir())
	return nil
}

// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// The final component of rp must name an empty directory; "." yields EINVAL
// and ".." yields ENOTEMPTY.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	fs.mu.Lock()
	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
	// could result in this filesystem being released which acquires fs.mu.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	defer fs.mu.Unlock()
	parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry))
	if err != nil {
		return err
	}
	if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	name := rp.Component()
	if name == "." {
		return linuxerr.EINVAL
	}
	if name == ".." {
		return linuxerr.ENOTEMPTY
	}
	child, ok := parentDir.childMap[name]
	if !ok {
		return linuxerr.ENOENT
	}
	if err := parentDir.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	childDir, ok := child.inode.impl.(*directory)
	if !ok {
		return linuxerr.ENOTDIR
	}
	if len(childDir.childMap) != 0 {
		return linuxerr.ENOTEMPTY
	}
	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}
	parentDir.removeChildLocked(child)
	parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */)
	// Remove links for child, child/., and child/..
	child.inode.decLinksLocked(ctx)
	child.inode.decLinksLocked(ctx)
	parentDir.inode.decLinksLocked(ctx)
	toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	parentDir.inode.touchCMtime()
	return nil
}

// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
//
// The inotify event, if any, is generated after fs.mu has been released.
func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
	fs.mu.RLock()
	d, err := resolveLocked(ctx, rp)
	if err != nil {
		fs.mu.RUnlock()
		return err
	}
	err = d.inode.setStat(ctx, rp.Credentials(), &opts)
	fs.mu.RUnlock()
	if err != nil {
		return err
	}

	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
	}
	return nil
}

// StatAt implements vfs.FilesystemImpl.StatAt.
759 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { 760 fs.mu.RLock() 761 defer fs.mu.RUnlock() 762 d, err := resolveLocked(ctx, rp) 763 if err != nil { 764 return linux.Statx{}, err 765 } 766 var stat linux.Statx 767 d.inode.statTo(&stat) 768 return stat, nil 769 } 770 771 // StatFSAt implements vfs.FilesystemImpl.StatFSAt. 772 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { 773 fs.mu.RLock() 774 defer fs.mu.RUnlock() 775 if _, err := resolveLocked(ctx, rp); err != nil { 776 return linux.Statfs{}, err 777 } 778 return fs.statFS(), nil 779 } 780 781 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 782 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { 783 return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { 784 // Linux allocates a page to store symlink targets that have length larger 785 // than shortSymlinkLen. Targets are just stored as string here, but simulate 786 // the page accounting for it. See mm/shmem.c:shmem_symlink(). 787 if len(target) >= shortSymlinkLen { 788 if !fs.accountPages(1) { 789 return linuxerr.ENOSPC 790 } 791 } 792 creds := rp.Credentials() 793 child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir)) 794 parentDir.insertChildLocked(child, name) 795 return nil 796 }) 797 } 798 799 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 800 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { 801 fs.mu.Lock() 802 // We need to DecRef outside of fs.mu because forgetting a dead mountpoint 803 // could result in this filesystem being released which acquires fs.mu. 
804 var toDecRef []refs.RefCounter 805 defer func() { 806 for _, ref := range toDecRef { 807 ref.DecRef(ctx) 808 } 809 }() 810 defer fs.mu.Unlock() 811 parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 812 if err != nil { 813 return err 814 } 815 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 816 return err 817 } 818 name := rp.Component() 819 if name == "." || name == ".." { 820 return linuxerr.EISDIR 821 } 822 child, ok := parentDir.childMap[name] 823 if !ok { 824 return linuxerr.ENOENT 825 } 826 if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { 827 return err 828 } 829 if child.inode.isDir() { 830 return linuxerr.EISDIR 831 } 832 if rp.MustBeDir() { 833 return linuxerr.ENOTDIR 834 } 835 mnt := rp.Mount() 836 if err := mnt.CheckBeginWrite(); err != nil { 837 return err 838 } 839 defer mnt.EndWrite() 840 vfsObj := rp.VirtualFilesystem() 841 mntns := vfs.MountNamespaceFromContext(ctx) 842 defer mntns.DecRef(ctx) 843 if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { 844 return err 845 } 846 // Generate inotify events. Note that this must take place before the link 847 // count of the child is decremented, or else the watches may be dropped 848 // before these events are added. 849 vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) 850 parentDir.removeChildLocked(child) 851 child.inode.decLinksLocked(ctx) 852 toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd) 853 parentDir.inode.touchCMtime() 854 return nil 855 } 856 857 // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. 
858 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { 859 fs.mu.RLock() 860 defer fs.mu.RUnlock() 861 d, err := resolveLocked(ctx, rp) 862 if err != nil { 863 return nil, err 864 } 865 if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 866 return nil, err 867 } 868 switch impl := d.inode.impl.(type) { 869 case *socketFile: 870 if impl.ep == nil { 871 return nil, linuxerr.ECONNREFUSED 872 } 873 return impl.ep, nil 874 default: 875 return nil, linuxerr.ECONNREFUSED 876 } 877 } 878 879 // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. 880 func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { 881 fs.mu.RLock() 882 defer fs.mu.RUnlock() 883 d, err := resolveLocked(ctx, rp) 884 if err != nil { 885 return nil, err 886 } 887 return d.inode.listXattr(rp.Credentials(), size) 888 } 889 890 // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. 891 func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { 892 fs.mu.RLock() 893 defer fs.mu.RUnlock() 894 d, err := resolveLocked(ctx, rp) 895 if err != nil { 896 return "", err 897 } 898 return d.inode.getXattr(rp.Credentials(), &opts) 899 } 900 901 // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 902 func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { 903 fs.mu.RLock() 904 d, err := resolveLocked(ctx, rp) 905 if err != nil { 906 fs.mu.RUnlock() 907 return err 908 } 909 err = d.inode.setXattr(rp.Credentials(), &opts) 910 fs.mu.RUnlock() 911 if err != nil { 912 return err 913 } 914 915 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 916 return nil 917 } 918 919 // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. 
920 func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { 921 fs.mu.RLock() 922 d, err := resolveLocked(ctx, rp) 923 if err != nil { 924 fs.mu.RUnlock() 925 return err 926 } 927 err = d.inode.removeXattr(rp.Credentials(), name) 928 fs.mu.RUnlock() 929 if err != nil { 930 return err 931 } 932 933 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 934 return nil 935 } 936 937 // PrependPath implements vfs.FilesystemImpl.PrependPath. 938 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 939 fs.mu.RLock() 940 defer fs.mu.RUnlock() 941 mnt := vd.Mount() 942 d := vd.Dentry().Impl().(*dentry) 943 for { 944 if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { 945 return vfs.PrependPathAtVFSRootError{} 946 } 947 if mnt != nil && &d.vfsd == mnt.Root() { 948 return nil 949 } 950 parent := d.parent.Load() 951 if parent == nil { 952 if d.name != "" { 953 // This file must have been created by 954 // newUnlinkedRegularFileDescription(). In Linux, 955 // mm/shmem.c:__shmem_file_setup() => 956 // fs/file_table.c:alloc_file_pseudo() sets the created 957 // dentry's dentry_operations to anon_ops, for which d_dname == 958 // simple_dname. fs/d_path.c:simple_dname() defines the 959 // dentry's pathname to be its name, prefixed with "/" and 960 // suffixed with " (deleted)". 961 b.PrependComponent("/" + d.name) 962 b.AppendString(" (deleted)") 963 return vfs.PrependPathSyntheticError{} 964 } 965 return vfs.PrependPathAtNonMountRootError{} 966 } 967 b.PrependComponent(d.name) 968 d = parent 969 } 970 } 971 972 // MountOptions implements vfs.FilesystemImpl.MountOptions. 973 func (fs *filesystem) MountOptions() string { 974 return fs.mopts 975 } 976 977 // IsDescendant implements vfs.FilesystemImpl.IsDescendant. 
978 func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool { 979 return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry)) 980 } 981 982 // adjustPageAcct adjusts the accounting done against filesystem size limit in 983 // case there is any discrepancy between the number of pages reserved vs the 984 // number of pages actually allocated. 985 func (fs *filesystem) adjustPageAcct(reserved, alloced uint64) { 986 if reserved < alloced { 987 panic(fmt.Sprintf("More pages were allocated than the pages reserved: reserved=%d, alloced=%d", reserved, alloced)) 988 } 989 if pagesDiff := reserved - alloced; pagesDiff > 0 { 990 fs.unaccountPages(pagesDiff) 991 } 992 } 993 994 // accountPagesPartial increases the pagesUsed if tmpfs is mounted with size 995 // option by as much as possible without going over the size mount option. It 996 // returns the number of pages that we were able to account for. It returns false 997 // when the maxSizeInPages has been exhausted and no more allocation can be done. 998 // The returned value is guaranteed to be <= pagesInc. If the size mount option is 999 // not set, then pagesInc will be returned. 1000 func (fs *filesystem) accountPagesPartial(pagesInc uint64) uint64 { 1001 if pagesInc == 0 { 1002 return pagesInc 1003 } 1004 1005 for { 1006 pagesUsed := fs.pagesUsed.Load() 1007 if fs.maxSizeInPages <= pagesUsed { 1008 return 0 1009 } 1010 1011 pagesFree := fs.maxSizeInPages - pagesUsed 1012 toInc := pagesInc 1013 if pagesFree < pagesInc { 1014 toInc = pagesFree 1015 } 1016 1017 if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+toInc) { 1018 return toInc 1019 } 1020 } 1021 } 1022 1023 // accountPages increases the pagesUsed in filesystem struct if tmpfs 1024 // is mounted with size option. We return a false when the maxSizeInPages 1025 // has been exhausted and no more allocation can be done. 
1026 func (fs *filesystem) accountPages(pagesInc uint64) bool { 1027 if pagesInc == 0 { 1028 return true // No accounting needed. 1029 } 1030 1031 for { 1032 pagesUsed := fs.pagesUsed.Load() 1033 if fs.maxSizeInPages <= pagesUsed { 1034 return false 1035 } 1036 1037 pagesFree := fs.maxSizeInPages - pagesUsed 1038 if pagesFree < pagesInc { 1039 return false 1040 } 1041 1042 if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+pagesInc) { 1043 return true 1044 } 1045 } 1046 } 1047 1048 // unaccountPages decreases the pagesUsed in filesystem struct if tmpfs 1049 // is mounted with size option. 1050 func (fs *filesystem) unaccountPages(pagesDec uint64) { 1051 if pagesDec == 0 { 1052 return 1053 } 1054 1055 for { 1056 pagesUsed := fs.pagesUsed.Load() 1057 if pagesUsed < pagesDec { 1058 panic(fmt.Sprintf("Deallocating more pages than allocated: fs.pagesUsed = %d, pagesDec = %d", pagesUsed, pagesDec)) 1059 } 1060 if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed-pagesDec) { 1061 break 1062 } 1063 } 1064 }