github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/tmpfs/filesystem.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tmpfs 16 17 import ( 18 "fmt" 19 20 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 21 "github.com/nicocha30/gvisor-ligolo/pkg/context" 22 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 23 "github.com/nicocha30/gvisor-ligolo/pkg/fspath" 24 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsmetric" 25 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 26 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport" 27 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 28 ) 29 30 const ( 31 // direntSize is the size of each directory entry 32 // that Linux uses for computing directory size. 33 // "20" is mm/shmem.c:BOGO_DIRENT_SIZE. 34 direntSize = 20 35 // Linux implementation uses a SHORT_SYMLINK_LEN 128. 36 // It accounts size for only SYMLINK with size >= 128. 37 shortSymlinkLen = 128 38 ) 39 40 // Sync implements vfs.FilesystemImpl.Sync. 41 func (fs *filesystem) Sync(ctx context.Context) error { 42 // All filesystem state is in-memory. 43 return nil 44 } 45 46 // stepLocked resolves rp.Component() to an existing file, starting from the 47 // given directory. 48 // 49 // stepLocked is loosely analogous to fs/namei.c:walk_component(). 50 // 51 // Preconditions: 52 // - filesystem.mu must be locked. 53 // - !rp.Done(). 54 func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*dentry, bool, error) { 55 dir, ok := d.inode.impl.(*directory) 56 if !ok { 57 return nil, false, linuxerr.ENOTDIR 58 } 59 if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 60 return nil, false, err 61 } 62 name := rp.Component() 63 if name == "." { 64 rp.Advance() 65 return d, false, nil 66 } 67 if name == ".." { 68 if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil { 69 return nil, false, err 70 } else if isRoot || d.parent == nil { 71 rp.Advance() 72 return d, false, nil 73 } 74 if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil { 75 return nil, false, err 76 } 77 rp.Advance() 78 return d.parent, false, nil 79 } 80 if len(name) > d.inode.fs.maxFilenameLen { 81 return nil, false, linuxerr.ENAMETOOLONG 82 } 83 child, ok := dir.childMap[name] 84 if !ok { 85 return nil, false, linuxerr.ENOENT 86 } 87 if err := rp.CheckMount(ctx, &child.vfsd); err != nil { 88 return nil, false, err 89 } 90 if symlink, ok := child.inode.impl.(*symlink); ok && rp.ShouldFollowSymlink() { 91 // Symlink traversal updates access time. 92 child.inode.touchAtime(rp.Mount()) 93 followedSymlink, err := rp.HandleSymlink(symlink.target) 94 return d, followedSymlink, err 95 } 96 rp.Advance() 97 return child, false, nil 98 } 99 100 // walkParentDirLocked resolves all but the last path component of rp to an 101 // existing directory, starting from the given directory (which is usually 102 // rp.Start().Impl().(*dentry)). It does not check that the returned directory 103 // is searchable by the provider of rp. 104 // 105 // walkParentDirLocked is loosely analogous to Linux's 106 // fs/namei.c:path_parentat(). 107 // 108 // Preconditions: 109 // - filesystem.mu must be locked. 110 // - !rp.Done(). 111 func walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry) (*directory, error) { 112 for !rp.Final() { 113 next, _, err := stepLocked(ctx, rp, d) 114 if err != nil { 115 return nil, err 116 } 117 d = next 118 } 119 dir, ok := d.inode.impl.(*directory) 120 if !ok { 121 return nil, linuxerr.ENOTDIR 122 } 123 return dir, nil 124 } 125 126 // resolveLocked resolves rp to an existing file. 127 // 128 // resolveLocked is loosely analogous to Linux's fs/namei.c:path_lookupat(). 129 // 130 // Preconditions: filesystem.mu must be locked. 131 func resolveLocked(ctx context.Context, rp *vfs.ResolvingPath) (*dentry, error) { 132 d := rp.Start().Impl().(*dentry) 133 134 if symlink, ok := d.inode.impl.(*symlink); rp.Done() && ok && rp.ShouldFollowSymlink() { 135 // Path with a single component. We don't need to step to the next 136 // component, but still need to resolve any symlinks. 137 // 138 // Symlink traversal updates access time. 139 d.inode.touchAtime(rp.Mount()) 140 if _, err := rp.HandleSymlink(symlink.target); err != nil { 141 return nil, err 142 } 143 } else { 144 // Path with multiple components, walk and resolve as required. 145 for !rp.Done() { 146 next, _, err := stepLocked(ctx, rp, d) 147 if err != nil { 148 return nil, err 149 } 150 d = next 151 } 152 } 153 154 if rp.MustBeDir() && !d.inode.isDir() { 155 return nil, linuxerr.ENOTDIR 156 } 157 return d, nil 158 } 159 160 // doCreateAt checks that creating a file at rp is permitted, then invokes 161 // create to do so. 162 // 163 // doCreateAt is loosely analogous to a conjunction of Linux's 164 // fs/namei.c:filename_create() and done_path_create(). 165 // 166 // Preconditions: 167 // - !rp.Done(). 168 // - For the final path component in rp, !rp.ShouldFollowSymlink(). 169 func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, dir bool, create func(parentDir *directory, name string) error) error { 170 fs.mu.Lock() 171 defer fs.mu.Unlock() 172 parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 173 if err != nil { 174 return err 175 } 176 177 // Order of checks is important. First check if parent directory can be 178 // executed, then check for existence, and lastly check if mount is writable. 179 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 180 return err 181 } 182 name := rp.Component() 183 if name == "." || name == ".." { 184 return linuxerr.EEXIST 185 } 186 if len(name) > fs.maxFilenameLen { 187 return linuxerr.ENAMETOOLONG 188 } 189 if _, ok := parentDir.childMap[name]; ok { 190 return linuxerr.EEXIST 191 } 192 if !dir && rp.MustBeDir() { 193 return linuxerr.ENOENT 194 } 195 // tmpfs never calls VFS.InvalidateDentry(), so parentDir.dentry can only 196 // be dead if it was deleted. 197 if parentDir.dentry.vfsd.IsDead() { 198 return linuxerr.ENOENT 199 } 200 mnt := rp.Mount() 201 if err := mnt.CheckBeginWrite(); err != nil { 202 return err 203 } 204 defer mnt.EndWrite() 205 206 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 207 return err 208 } 209 if err := create(parentDir, name); err != nil { 210 return err 211 } 212 213 ev := linux.IN_CREATE 214 if dir { 215 ev |= linux.IN_ISDIR 216 } 217 parentDir.inode.watches.Notify(ctx, name, uint32(ev), 0, vfs.InodeEvent, false /* unlinked */) 218 parentDir.inode.touchCMtime() 219 return nil 220 } 221 222 // AccessAt implements vfs.Filesystem.Impl.AccessAt. 223 func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error { 224 fs.mu.RLock() 225 defer fs.mu.RUnlock() 226 d, err := resolveLocked(ctx, rp) 227 if err != nil { 228 return err 229 } 230 if err := d.inode.checkPermissions(creds, ats); err != nil { 231 return err 232 } 233 if ats.MayWrite() && rp.Mount().ReadOnly() { 234 return linuxerr.EROFS 235 } 236 return nil 237 } 238 239 // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt. 240 func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) { 241 fs.mu.RLock() 242 defer fs.mu.RUnlock() 243 d, err := resolveLocked(ctx, rp) 244 if err != nil { 245 return nil, err 246 } 247 if opts.CheckSearchable { 248 if !d.inode.isDir() { 249 return nil, linuxerr.ENOTDIR 250 } 251 if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 252 return nil, err 253 } 254 } 255 d.IncRef() 256 return &d.vfsd, nil 257 } 258 259 // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt. 260 func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) { 261 fs.mu.RLock() 262 defer fs.mu.RUnlock() 263 dir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 264 if err != nil { 265 return nil, err 266 } 267 dir.dentry.IncRef() 268 return &dir.dentry.vfsd, nil 269 } 270 271 // LinkAt implements vfs.FilesystemImpl.LinkAt. 272 func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error { 273 return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { 274 if rp.Mount() != vd.Mount() { 275 return linuxerr.EXDEV 276 } 277 d := vd.Dentry().Impl().(*dentry) 278 i := d.inode 279 if i.isDir() { 280 return linuxerr.EPERM 281 } 282 if err := vfs.MayLink(auth.CredentialsFromContext(ctx), linux.FileMode(i.mode.Load()), auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil { 283 return err 284 } 285 if i.nlink.Load() == 0 { 286 return linuxerr.ENOENT 287 } 288 if i.nlink.Load() == maxLinks { 289 return linuxerr.EMLINK 290 } 291 i.incLinksLocked() 292 i.watches.Notify(ctx, "", linux.IN_ATTRIB, 0, vfs.InodeEvent, false /* unlinked */) 293 parentDir.insertChildLocked(fs.newDentry(i), name) 294 return nil 295 }) 296 } 297 298 // MkdirAt implements vfs.FilesystemImpl.MkdirAt. 299 func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error { 300 return fs.doCreateAt(ctx, rp, true /* dir */, func(parentDir *directory, name string) error { 301 creds := rp.Credentials() 302 if parentDir.inode.nlink.Load() == maxLinks { 303 return linuxerr.EMLINK 304 } 305 parentDir.inode.incLinksLocked() // from child's ".." 306 childDir := fs.newDirectory(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) 307 parentDir.insertChildLocked(&childDir.dentry, name) 308 return nil 309 }) 310 } 311 312 // MknodAt implements vfs.FilesystemImpl.MknodAt. 313 func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error { 314 return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { 315 creds := rp.Credentials() 316 var childInode *inode 317 switch opts.Mode.FileType() { 318 case linux.S_IFREG: 319 childInode = fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) 320 case linux.S_IFIFO: 321 childInode = fs.newNamedPipe(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir) 322 case linux.S_IFBLK: 323 childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.BlockDevice, opts.DevMajor, opts.DevMinor, parentDir) 324 case linux.S_IFCHR: 325 childInode = fs.newDeviceFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, vfs.CharDevice, opts.DevMajor, opts.DevMinor, parentDir) 326 case linux.S_IFSOCK: 327 childInode = fs.newSocketFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, opts.Endpoint, parentDir) 328 default: 329 return linuxerr.EINVAL 330 } 331 child := fs.newDentry(childInode) 332 parentDir.insertChildLocked(child, name) 333 return nil 334 }) 335 } 336 337 // OpenAt implements vfs.FilesystemImpl.OpenAt. 338 func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 339 if opts.Flags&linux.O_TMPFILE != 0 { 340 // Not yet supported. 341 return nil, linuxerr.EOPNOTSUPP 342 } 343 344 // Handle O_CREAT and !O_CREAT separately, since in the latter case we 345 // don't need fs.mu for writing. 346 if opts.Flags&linux.O_CREAT == 0 { 347 fs.mu.RLock() 348 d, err := resolveLocked(ctx, rp) 349 if err != nil { 350 fs.mu.RUnlock() 351 return nil, err 352 } 353 d.IncRef() 354 defer d.DecRef(ctx) 355 fs.mu.RUnlock() 356 return d.open(ctx, rp, &opts, false /* afterCreate */) 357 } 358 359 mustCreate := opts.Flags&linux.O_EXCL != 0 360 start := rp.Start().Impl().(*dentry) 361 fs.mu.Lock() 362 unlocked := false 363 unlock := func() { 364 if !unlocked { 365 fs.mu.Unlock() 366 unlocked = true 367 } 368 } 369 defer unlock() 370 if rp.Done() { 371 // Reject attempts to open mount root directory with O_CREAT. 372 if rp.MustBeDir() { 373 return nil, linuxerr.EISDIR 374 } 375 if mustCreate { 376 return nil, linuxerr.EEXIST 377 } 378 start.IncRef() 379 defer start.DecRef(ctx) 380 unlock() 381 return start.open(ctx, rp, &opts, false /* afterCreate */) 382 } 383 afterTrailingSymlink: 384 parentDir, err := walkParentDirLocked(ctx, rp, start) 385 if err != nil { 386 return nil, err 387 } 388 // Check for search permission in the parent directory. 389 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil { 390 return nil, err 391 } 392 // Reject attempts to open directories with O_CREAT. 393 if rp.MustBeDir() { 394 return nil, linuxerr.EISDIR 395 } 396 name := rp.Component() 397 child, followedSymlink, err := stepLocked(ctx, rp, &parentDir.dentry) 398 if followedSymlink { 399 if mustCreate { 400 // EEXIST must be returned if an existing symlink is opened with O_EXCL. 401 return nil, linuxerr.EEXIST 402 } 403 if err != nil { 404 // If followedSymlink && err != nil, then this symlink resolution error 405 // must be handled by the VFS layer. 406 return nil, err 407 } 408 start = &parentDir.dentry 409 goto afterTrailingSymlink 410 } 411 if linuxerr.Equals(linuxerr.ENOENT, err) { 412 // Already checked for searchability above; now check for writability. 413 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 414 return nil, err 415 } 416 if err := rp.Mount().CheckBeginWrite(); err != nil { 417 return nil, err 418 } 419 defer rp.Mount().EndWrite() 420 // Create and open the child. 421 creds := rp.Credentials() 422 child := fs.newDentry(fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, opts.Mode, parentDir)) 423 parentDir.insertChildLocked(child, name) 424 child.IncRef() 425 defer child.DecRef(ctx) 426 unlock() 427 fd, err := child.open(ctx, rp, &opts, true) 428 if err != nil { 429 return nil, err 430 } 431 parentDir.inode.watches.Notify(ctx, name, linux.IN_CREATE, 0, vfs.PathEvent, false /* unlinked */) 432 parentDir.inode.touchCMtime() 433 return fd, nil 434 } 435 if err != nil { 436 return nil, err 437 } 438 if mustCreate { 439 return nil, linuxerr.EEXIST 440 } 441 if rp.MustBeDir() && !child.inode.isDir() { 442 return nil, linuxerr.ENOTDIR 443 } 444 child.IncRef() 445 defer child.DecRef(ctx) 446 unlock() 447 return child.open(ctx, rp, &opts, false) 448 } 449 450 // Preconditions: The caller must hold no locks (since opening pipes may block 451 // indefinitely). 452 func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions, afterCreate bool) (*vfs.FileDescription, error) { 453 ats := vfs.AccessTypesForOpenFlags(opts) 454 if !afterCreate { 455 if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil { 456 return nil, err 457 } 458 } 459 switch impl := d.inode.impl.(type) { 460 case *regularFile: 461 var fd regularFileFD 462 fd.LockFD.Init(&d.inode.locks) 463 if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { 464 return nil, err 465 } 466 if !afterCreate && opts.Flags&linux.O_TRUNC != 0 { 467 if _, err := impl.truncate(0); err != nil { 468 return nil, err 469 } 470 } 471 if fd.vfsfd.IsWritable() { 472 fsmetric.TmpfsOpensW.Increment() 473 } else if fd.vfsfd.IsReadable() { 474 fsmetric.TmpfsOpensRO.Increment() 475 } 476 return &fd.vfsfd, nil 477 case *directory: 478 // Can't open directories with O_CREAT. 479 if opts.Flags&linux.O_CREAT != 0 { 480 return nil, linuxerr.EISDIR 481 } 482 // Can't open directories writably. 483 if ats&vfs.MayWrite != 0 { 484 return nil, linuxerr.EISDIR 485 } 486 if opts.Flags&linux.O_DIRECT != 0 { 487 return nil, linuxerr.EINVAL 488 } 489 var fd directoryFD 490 fd.LockFD.Init(&d.inode.locks) 491 if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil { 492 return nil, err 493 } 494 return &fd.vfsfd, nil 495 case *symlink: 496 // Can't open symlinks without O_PATH, which is handled at the VFS layer. 497 return nil, linuxerr.ELOOP 498 case *namedPipe: 499 return impl.pipe.Open(ctx, rp.Mount(), &d.vfsd, opts.Flags, &d.inode.locks) 500 case *deviceFile: 501 return rp.VirtualFilesystem().OpenDeviceSpecialFile(ctx, rp.Mount(), &d.vfsd, impl.kind, impl.major, impl.minor, opts) 502 case *socketFile: 503 return nil, linuxerr.ENXIO 504 default: 505 panic(fmt.Sprintf("unknown inode type: %T", d.inode.impl)) 506 } 507 } 508 509 // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt. 510 func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) { 511 fs.mu.RLock() 512 defer fs.mu.RUnlock() 513 d, err := resolveLocked(ctx, rp) 514 if err != nil { 515 return "", err 516 } 517 symlink, ok := d.inode.impl.(*symlink) 518 if !ok { 519 return "", linuxerr.EINVAL 520 } 521 symlink.inode.touchAtime(rp.Mount()) 522 return symlink.target, nil 523 } 524 525 // RenameAt implements vfs.FilesystemImpl.RenameAt. 526 func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error { 527 // Resolve newParentDir first to verify that it's on this Mount. 528 fs.mu.Lock() 529 defer fs.mu.Unlock() 530 newParentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 531 if err != nil { 532 return err 533 } 534 535 if opts.Flags&^linux.RENAME_NOREPLACE != 0 { 536 // TODO(b/145974740): Support other renameat2 flags. 537 return linuxerr.EINVAL 538 } 539 540 newName := rp.Component() 541 if newName == "." || newName == ".." { 542 if opts.Flags&linux.RENAME_NOREPLACE != 0 { 543 return linuxerr.EEXIST 544 } 545 return linuxerr.EBUSY 546 } 547 if len(newName) > fs.maxFilenameLen { 548 return linuxerr.ENAMETOOLONG 549 } 550 mnt := rp.Mount() 551 if mnt != oldParentVD.Mount() { 552 return linuxerr.EXDEV 553 } 554 if err := mnt.CheckBeginWrite(); err != nil { 555 return err 556 } 557 defer mnt.EndWrite() 558 559 oldParentDir := oldParentVD.Dentry().Impl().(*dentry).inode.impl.(*directory) 560 if err := oldParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 561 return err 562 } 563 renamed, ok := oldParentDir.childMap[oldName] 564 if !ok { 565 return linuxerr.ENOENT 566 } 567 if err := oldParentDir.mayDelete(rp.Credentials(), renamed); err != nil { 568 return err 569 } 570 // Note that we don't need to call rp.CheckMount(), since if renamed is a 571 // mount point then we want to rename the mount point, not anything in the 572 // mounted filesystem. 573 if renamed.inode.isDir() { 574 if renamed == &newParentDir.dentry || genericIsAncestorDentry(renamed, &newParentDir.dentry) { 575 return linuxerr.EINVAL 576 } 577 if oldParentDir != newParentDir { 578 // Writability is needed to change renamed's "..". 579 if err := renamed.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 580 return err 581 } 582 } 583 } else { 584 if opts.MustBeDir || rp.MustBeDir() { 585 return linuxerr.ENOTDIR 586 } 587 } 588 589 if err := newParentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 590 return err 591 } 592 replaced, ok := newParentDir.childMap[newName] 593 if ok { 594 if opts.Flags&linux.RENAME_NOREPLACE != 0 { 595 return linuxerr.EEXIST 596 } 597 replacedDir, ok := replaced.inode.impl.(*directory) 598 if ok { 599 if !renamed.inode.isDir() { 600 return linuxerr.EISDIR 601 } 602 if len(replacedDir.childMap) != 0 { 603 return linuxerr.ENOTEMPTY 604 } 605 } else { 606 if rp.MustBeDir() { 607 return linuxerr.ENOTDIR 608 } 609 if renamed.inode.isDir() { 610 return linuxerr.ENOTDIR 611 } 612 } 613 } else { 614 if renamed.inode.isDir() && newParentDir.inode.nlink.Load() == maxLinks { 615 return linuxerr.EMLINK 616 } 617 } 618 // tmpfs never calls VFS.InvalidateDentry(), so newParentDir.dentry can 619 // only be dead if it was deleted. 620 if newParentDir.dentry.vfsd.IsDead() { 621 return linuxerr.ENOENT 622 } 623 624 // Linux places this check before some of those above; we do it here for 625 // simplicity, under the assumption that applications are not intentionally 626 // doing noop renames expecting them to succeed where non-noop renames 627 // would fail. 628 if renamed == replaced { 629 return nil 630 } 631 vfsObj := rp.VirtualFilesystem() 632 mntns := vfs.MountNamespaceFromContext(ctx) 633 defer mntns.DecRef(ctx) 634 var replacedVFSD *vfs.Dentry 635 if replaced != nil { 636 replacedVFSD = &replaced.vfsd 637 } 638 if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil { 639 return err 640 } 641 if replaced != nil { 642 newParentDir.removeChildLocked(replaced) 643 if replaced.inode.isDir() { 644 // Remove links for replaced/. and replaced/.. 645 replaced.inode.decLinksLocked(ctx) 646 newParentDir.inode.decLinksLocked(ctx) 647 } 648 replaced.inode.decLinksLocked(ctx) 649 } 650 oldParentDir.removeChildLocked(renamed) 651 newParentDir.insertChildLocked(renamed, newName) 652 vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD) 653 oldParentDir.inode.touchCMtime() 654 if oldParentDir != newParentDir { 655 if renamed.inode.isDir() { 656 oldParentDir.inode.decLinksLocked(ctx) 657 newParentDir.inode.incLinksLocked() 658 } 659 newParentDir.inode.touchCMtime() 660 } 661 renamed.inode.touchCtime() 662 663 vfs.InotifyRename(ctx, &renamed.inode.watches, &oldParentDir.inode.watches, &newParentDir.inode.watches, oldName, newName, renamed.inode.isDir()) 664 return nil 665 } 666 667 // RmdirAt implements vfs.FilesystemImpl.RmdirAt. 668 func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error { 669 fs.mu.Lock() 670 defer fs.mu.Unlock() 671 parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 672 if err != nil { 673 return err 674 } 675 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 676 return err 677 } 678 name := rp.Component() 679 if name == "." { 680 return linuxerr.EINVAL 681 } 682 if name == ".." { 683 return linuxerr.ENOTEMPTY 684 } 685 child, ok := parentDir.childMap[name] 686 if !ok { 687 return linuxerr.ENOENT 688 } 689 if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { 690 return err 691 } 692 childDir, ok := child.inode.impl.(*directory) 693 if !ok { 694 return linuxerr.ENOTDIR 695 } 696 if len(childDir.childMap) != 0 { 697 return linuxerr.ENOTEMPTY 698 } 699 mnt := rp.Mount() 700 if err := mnt.CheckBeginWrite(); err != nil { 701 return err 702 } 703 defer mnt.EndWrite() 704 vfsObj := rp.VirtualFilesystem() 705 mntns := vfs.MountNamespaceFromContext(ctx) 706 defer mntns.DecRef(ctx) 707 if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { 708 return err 709 } 710 parentDir.removeChildLocked(child) 711 parentDir.inode.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0, vfs.InodeEvent, true /* unlinked */) 712 // Remove links for child, child/., and child/.. 713 child.inode.decLinksLocked(ctx) 714 child.inode.decLinksLocked(ctx) 715 parentDir.inode.decLinksLocked(ctx) 716 vfsObj.CommitDeleteDentry(ctx, &child.vfsd) 717 parentDir.inode.touchCMtime() 718 return nil 719 } 720 721 // SetStatAt implements vfs.FilesystemImpl.SetStatAt. 722 func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error { 723 fs.mu.RLock() 724 d, err := resolveLocked(ctx, rp) 725 if err != nil { 726 fs.mu.RUnlock() 727 return err 728 } 729 err = d.inode.setStat(ctx, rp.Credentials(), &opts) 730 fs.mu.RUnlock() 731 if err != nil { 732 return err 733 } 734 735 if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { 736 d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) 737 } 738 return nil 739 } 740 741 // StatAt implements vfs.FilesystemImpl.StatAt. 742 func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) { 743 fs.mu.RLock() 744 defer fs.mu.RUnlock() 745 d, err := resolveLocked(ctx, rp) 746 if err != nil { 747 return linux.Statx{}, err 748 } 749 var stat linux.Statx 750 d.inode.statTo(&stat) 751 return stat, nil 752 } 753 754 // StatFSAt implements vfs.FilesystemImpl.StatFSAt. 755 func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) { 756 fs.mu.RLock() 757 defer fs.mu.RUnlock() 758 if _, err := resolveLocked(ctx, rp); err != nil { 759 return linux.Statfs{}, err 760 } 761 return fs.statFS(), nil 762 } 763 764 // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt. 765 func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error { 766 return fs.doCreateAt(ctx, rp, false /* dir */, func(parentDir *directory, name string) error { 767 // Linux allocates a page to store symlink targets that have length larger 768 // than shortSymlinkLen. Targets are just stored as string here, but simulate 769 // the page accounting for it. See mm/shmem.c:shmem_symlink(). 770 if len(target) >= shortSymlinkLen { 771 if !fs.accountPages(1) { 772 return linuxerr.ENOSPC 773 } 774 } 775 creds := rp.Credentials() 776 child := fs.newDentry(fs.newSymlink(creds.EffectiveKUID, creds.EffectiveKGID, 0777, target, parentDir)) 777 parentDir.insertChildLocked(child, name) 778 return nil 779 }) 780 } 781 782 // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt. 783 func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error { 784 fs.mu.Lock() 785 defer fs.mu.Unlock() 786 parentDir, err := walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry)) 787 if err != nil { 788 return err 789 } 790 if err := parentDir.inode.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil { 791 return err 792 } 793 name := rp.Component() 794 if name == "." || name == ".." { 795 return linuxerr.EISDIR 796 } 797 child, ok := parentDir.childMap[name] 798 if !ok { 799 return linuxerr.ENOENT 800 } 801 if err := parentDir.mayDelete(rp.Credentials(), child); err != nil { 802 return err 803 } 804 if child.inode.isDir() { 805 return linuxerr.EISDIR 806 } 807 if rp.MustBeDir() { 808 return linuxerr.ENOTDIR 809 } 810 mnt := rp.Mount() 811 if err := mnt.CheckBeginWrite(); err != nil { 812 return err 813 } 814 defer mnt.EndWrite() 815 vfsObj := rp.VirtualFilesystem() 816 mntns := vfs.MountNamespaceFromContext(ctx) 817 defer mntns.DecRef(ctx) 818 if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil { 819 return err 820 } 821 // Generate inotify events. Note that this must take place before the link 822 // count of the child is decremented, or else the watches may be dropped 823 // before these events are added. 824 vfs.InotifyRemoveChild(ctx, &child.inode.watches, &parentDir.inode.watches, name) 825 parentDir.removeChildLocked(child) 826 child.inode.decLinksLocked(ctx) 827 vfsObj.CommitDeleteDentry(ctx, &child.vfsd) 828 parentDir.inode.touchCMtime() 829 return nil 830 } 831 832 // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt. 833 func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) { 834 fs.mu.RLock() 835 defer fs.mu.RUnlock() 836 d, err := resolveLocked(ctx, rp) 837 if err != nil { 838 return nil, err 839 } 840 if err := d.inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil { 841 return nil, err 842 } 843 switch impl := d.inode.impl.(type) { 844 case *socketFile: 845 if impl.ep == nil { 846 return nil, linuxerr.ECONNREFUSED 847 } 848 return impl.ep, nil 849 default: 850 return nil, linuxerr.ECONNREFUSED 851 } 852 } 853 854 // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt. 855 func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) { 856 fs.mu.RLock() 857 defer fs.mu.RUnlock() 858 d, err := resolveLocked(ctx, rp) 859 if err != nil { 860 return nil, err 861 } 862 return d.inode.listXattr(rp.Credentials(), size) 863 } 864 865 // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt. 866 func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) { 867 fs.mu.RLock() 868 defer fs.mu.RUnlock() 869 d, err := resolveLocked(ctx, rp) 870 if err != nil { 871 return "", err 872 } 873 return d.inode.getXattr(rp.Credentials(), &opts) 874 } 875 876 // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt. 877 func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error { 878 fs.mu.RLock() 879 d, err := resolveLocked(ctx, rp) 880 if err != nil { 881 fs.mu.RUnlock() 882 return err 883 } 884 err = d.inode.setXattr(rp.Credentials(), &opts) 885 fs.mu.RUnlock() 886 if err != nil { 887 return err 888 } 889 890 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 891 return nil 892 } 893 894 // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt. 895 func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error { 896 fs.mu.RLock() 897 d, err := resolveLocked(ctx, rp) 898 if err != nil { 899 fs.mu.RUnlock() 900 return err 901 } 902 err = d.inode.removeXattr(rp.Credentials(), name) 903 fs.mu.RUnlock() 904 if err != nil { 905 return err 906 } 907 908 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 909 return nil 910 } 911 912 // PrependPath implements vfs.FilesystemImpl.PrependPath. 913 func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error { 914 fs.mu.RLock() 915 defer fs.mu.RUnlock() 916 mnt := vd.Mount() 917 d := vd.Dentry().Impl().(*dentry) 918 for { 919 if mnt == vfsroot.Mount() && &d.vfsd == vfsroot.Dentry() { 920 return vfs.PrependPathAtVFSRootError{} 921 } 922 if mnt != nil && &d.vfsd == mnt.Root() { 923 return nil 924 } 925 if d.parent == nil { 926 if d.name != "" { 927 // This file must have been created by 928 // newUnlinkedRegularFileDescription(). In Linux, 929 // mm/shmem.c:__shmem_file_setup() => 930 // fs/file_table.c:alloc_file_pseudo() sets the created 931 // dentry's dentry_operations to anon_ops, for which d_dname == 932 // simple_dname. fs/d_path.c:simple_dname() defines the 933 // dentry's pathname to be its name, prefixed with "/" and 934 // suffixed with " (deleted)". 935 b.PrependComponent("/" + d.name) 936 b.AppendString(" (deleted)") 937 return vfs.PrependPathSyntheticError{} 938 } 939 return vfs.PrependPathAtNonMountRootError{} 940 } 941 b.PrependComponent(d.name) 942 d = d.parent 943 } 944 } 945 946 // MountOptions implements vfs.FilesystemImpl.MountOptions. 947 func (fs *filesystem) MountOptions() string { 948 return fs.mopts 949 } 950 951 // adjustPageAcct adjusts the accounting done against filesystem size limit in 952 // case there is any discrepency between the number of pages reserved vs the 953 // number of pages actually allocated. 954 func (fs *filesystem) adjustPageAcct(reserved, alloced uint64) { 955 if reserved < alloced { 956 panic(fmt.Sprintf("More pages were allocated than the pages reserved: reserved=%d, alloced=%d", reserved, alloced)) 957 } 958 if pagesDiff := reserved - alloced; pagesDiff > 0 { 959 fs.unaccountPages(pagesDiff) 960 } 961 } 962 963 // accountPagesPartial increases the pagesUsed if tmpfs is mounted with size 964 // option by as much as possible without going over the size mount option. It 965 // returns the number of pages that we were able to account for. It returns false 966 // when the maxSizeInPages has been exhausted and no more allocation can be done. 967 // The returned value is guaranteed to be <= pagesInc. If the size mount option is 968 // not set, then pagesInc will be returned. 969 func (fs *filesystem) accountPagesPartial(pagesInc uint64) uint64 { 970 if pagesInc == 0 { 971 return pagesInc 972 } 973 974 for { 975 pagesUsed := fs.pagesUsed.Load() 976 if fs.maxSizeInPages <= pagesUsed { 977 return 0 978 } 979 980 pagesFree := fs.maxSizeInPages - pagesUsed 981 toInc := pagesInc 982 if pagesFree < pagesInc { 983 toInc = pagesFree 984 } 985 986 if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+toInc) { 987 return toInc 988 } 989 } 990 } 991 992 // accountPages increases the pagesUsed in filesystem struct if tmpfs 993 // is mounted with size option. We return a false when the maxSizeInPages 994 // has been exhausted and no more allocation can be done. 995 func (fs *filesystem) accountPages(pagesInc uint64) bool { 996 if pagesInc == 0 { 997 return true // No accounting needed. 998 } 999 1000 for { 1001 pagesUsed := fs.pagesUsed.Load() 1002 if fs.maxSizeInPages <= pagesUsed { 1003 return false 1004 } 1005 1006 pagesFree := fs.maxSizeInPages - pagesUsed 1007 if pagesFree < pagesInc { 1008 return false 1009 } 1010 1011 if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed+pagesInc) { 1012 return true 1013 } 1014 } 1015 } 1016 1017 // unaccountPages decreases the pagesUsed in filesystem struct if tmpfs 1018 // is mounted with size option. 1019 func (fs *filesystem) unaccountPages(pagesDec uint64) { 1020 if pagesDec == 0 { 1021 return 1022 } 1023 1024 for { 1025 pagesUsed := fs.pagesUsed.Load() 1026 if pagesUsed < pagesDec { 1027 panic(fmt.Sprintf("Deallocating more pages than allocated: fs.pagesUsed = %d, pagesDec = %d", pagesUsed, pagesDec)) 1028 } 1029 if fs.pagesUsed.CompareAndSwap(pagesUsed, pagesUsed-pagesDec) { 1030 break 1031 } 1032 } 1033 }