// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tmpfs provides an in-memory filesystem whose contents are
// application-mutable, consistent with Linux's tmpfs.
//
// Lock order:
//
//	filesystem.mu
//	inode.mu
//	regularFileFD.offMu
//	*** "memmap.Mappable locks" below this point
//	regularFile.mapsMu
//	*** "memmap.Mappable locks taken by Translate" below this point
//	regularFile.dataMu
//	fs.pagesUsedMu
//	directory.iterMu
package tmpfs

import (
	"fmt"
	"math"
	"strconv"
	"strings"

	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
	"github.com/MerlinKodo/gvisor/pkg/context"
	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
	"github.com/MerlinKodo/gvisor/pkg/fd"
	"github.com/MerlinKodo/gvisor/pkg/hostarch"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs/memxattr"
)

// Name is the default filesystem name.
const Name = "tmpfs"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that stores regular file contents. mf is
	// immutable, except it may be changed during restore.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// privateMF indicates whether mf is private to this tmpfs mount. If so,
	// tmpfs takes ownership of mf. privateMF is immutable.
	privateMF bool

	// mfp is used to provide mf, when privateMF == false. This is required to
	// re-provide mf on restore. mfp is immutable.
	mfp pgalloc.MemoryFileProvider

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// usage is the memory accounting category under which pages backing
	// files in this filesystem are accounted.
	usage usage.MemoryKind

	// mu serializes changes to the Dentry tree.
	mu filesystemRWMutex `state:"nosave"`

	// nextInoMinusOne is incremented by inode.init() to allocate the next
	// inode number; the value returned by the increment becomes inode.ino.
	nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations

	// root is the dentry at the root of this filesystem. It is assigned at
	// the end of a successful GetFilesystem().
	root *dentry

	// maxFilenameLen is the maximum filename length. GetFilesystem() sets it
	// to linux.NAME_MAX unless FilesystemOpts.MaxFilenameLen is positive.
	maxFilenameLen int

	// maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages.
	// This field is immutable.
	maxSizeInPages uint64

	// pagesUsed is the number of pages used by this filesystem.
	pagesUsed atomicbitops.Uint64

	// allowXattrPrefix is a set of xattr namespace prefixes that this
	// tmpfs mount will allow. It is immutable.
	allowXattrPrefix map[string]struct{}
}

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release.
func (FilesystemType) Release(ctx context.Context) {}

// FilesystemOpts is used to pass configuration data to tmpfs.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs.
	FilesystemType vfs.FilesystemType

	// Usage is the memory accounting category under which pages backing files in
	// the filesystem are accounted.
	Usage *usage.MemoryKind

	// MaxFilenameLen is the maximum filename length allowed by the tmpfs.
	MaxFilenameLen int

	// FilestoreFD is the FD for the memory file that will be used to store file
	// data. If this is nil, then MemoryFileProviderFromContext() is used.
	FilestoreFD *fd.FD

	// DisableDefaultSizeLimit disables setting a default size limit. In Linux,
	// SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super().
	DisableDefaultSizeLimit bool

	// AllowXattrPrefix is a set of xattr namespace prefixes that this
	// tmpfs mount will allow.
	AllowXattrPrefix []string
}

// defaultSizeLimit is the size limit, in bytes, applied to tmpfs mounts that
// do not specify a size= mount option. Zero means that no default has been
// configured. It is immutable after initialization.
var defaultSizeLimit uint64

// SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts
// that do not specify a size= mount option. This must be called only once,
// before any tmpfs filesystems are created.
163 func SetDefaultSizeLimit(sizeLimit uint64) { 164 defaultSizeLimit = sizeLimit 165 } 166 167 func getDefaultSizeLimit(disable bool) uint64 { 168 if disable || defaultSizeLimit == 0 { 169 // The size limit is used to populate statfs(2) results. If Linux tmpfs is 170 // mounted with no size option, then statfs(2) returns f_blocks == f_bfree 171 // == f_bavail == 0. However, many applications treat this as having a size 172 // limit of 0. To work around this, return a very large but non-zero size 173 // limit, chosen to ensure that it does not overflow int64. 174 return math.MaxInt64 175 } 176 return defaultSizeLimit 177 } 178 179 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 180 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 181 mfp := pgalloc.MemoryFileProviderFromContext(ctx) 182 if mfp == nil { 183 panic("MemoryFileProviderFromContext returned nil") 184 } 185 mf := mfp.MemoryFile() 186 privateMF := false 187 188 rootFileType := uint16(linux.S_IFDIR) 189 disableDefaultSizeLimit := false 190 newFSType := vfs.FilesystemType(&fstype) 191 192 // By default we support only "trusted" and "user" namespaces. Linux 193 // also supports "security" and (if configured) POSIX ACL namespaces 194 // "system.posix_acl_access" and "system.posix_acl_default". 195 allowXattrPrefix := map[string]struct{}{ 196 linux.XATTR_TRUSTED_PREFIX: struct{}{}, 197 linux.XATTR_USER_PREFIX: struct{}{}, 198 // The "security" namespace is allowed, but it always returns an error. 
199 linux.XATTR_SECURITY_PREFIX: struct{}{}, 200 } 201 202 tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts) 203 if tmpfsOptsOk { 204 if tmpfsOpts.RootFileType != 0 { 205 rootFileType = tmpfsOpts.RootFileType 206 } 207 if tmpfsOpts.FilesystemType != nil { 208 newFSType = tmpfsOpts.FilesystemType 209 } 210 disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit 211 if tmpfsOpts.FilestoreFD != nil { 212 mfOpts := pgalloc.MemoryFileOpts{ 213 // tmpfsOpts.FilestoreFD may be backed by a file on disk (not memfd), 214 // which needs to be decommited on destroy to release disk space. 215 DecommitOnDestroy: true, 216 // sentry's seccomp filters don't allow the mmap(2) syscalls that 217 // pgalloc.IMAWorkAroundForMemFile() uses. Users of tmpfsOpts.FilestoreFD 218 // are expected to have performed the work around outside the sandbox. 219 DisableIMAWorkAround: true, 220 // Custom filestore FDs are usually backed by files on disk. Ideally we 221 // would confirm with fstatfs(2) but that is prohibited by seccomp. 
222 DiskBackedFile: true, 223 } 224 var err error 225 mf, err = pgalloc.NewMemoryFile(tmpfsOpts.FilestoreFD.ReleaseToFile("overlay-filestore"), mfOpts) 226 if err != nil { 227 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: pgalloc.NewMemoryFile failed: %v", err) 228 return nil, nil, err 229 } 230 privateMF = true 231 } 232 233 for _, xattr := range tmpfsOpts.AllowXattrPrefix { 234 allowXattrPrefix[xattr] = struct{}{} 235 } 236 } 237 238 mopts := vfs.GenericParseMountOptions(opts.Data) 239 rootMode := linux.FileMode(0777) 240 if rootFileType == linux.S_IFDIR { 241 rootMode = 01777 242 } 243 modeStr, ok := mopts["mode"] 244 if ok { 245 delete(mopts, "mode") 246 mode, err := strconv.ParseUint(modeStr, 8, 32) 247 if err != nil { 248 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) 249 return nil, nil, linuxerr.EINVAL 250 } 251 rootMode = linux.FileMode(mode & 07777) 252 } 253 rootKUID := creds.EffectiveKUID 254 uidStr, ok := mopts["uid"] 255 if ok { 256 delete(mopts, "uid") 257 uid, err := strconv.ParseUint(uidStr, 10, 32) 258 if err != nil { 259 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) 260 return nil, nil, linuxerr.EINVAL 261 } 262 kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) 263 if !kuid.Ok() { 264 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) 265 return nil, nil, linuxerr.EINVAL 266 } 267 rootKUID = kuid 268 } 269 rootKGID := creds.EffectiveKGID 270 gidStr, ok := mopts["gid"] 271 if ok { 272 delete(mopts, "gid") 273 gid, err := strconv.ParseUint(gidStr, 10, 32) 274 if err != nil { 275 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) 276 return nil, nil, linuxerr.EINVAL 277 } 278 kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) 279 if !kgid.Ok() { 280 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) 281 return nil, nil, linuxerr.EINVAL 282 } 283 rootKGID = kgid 284 } 285 maxSizeInPages := 
getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize 286 maxSizeStr, ok := mopts["size"] 287 if ok { 288 delete(mopts, "size") 289 maxSizeInBytes, err := parseSize(maxSizeStr) 290 if err != nil { 291 ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err) 292 return nil, nil, linuxerr.EINVAL 293 } 294 // Convert size in bytes to nearest Page Size bytes 295 // as Linux allocates memory in terms of Page size. 296 maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes) 297 if !ok { 298 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok) 299 return nil, nil, linuxerr.EINVAL 300 } 301 } 302 303 if len(mopts) != 0 { 304 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 305 return nil, nil, linuxerr.EINVAL 306 } 307 308 devMinor, err := vfsObj.GetAnonBlockDevMinor() 309 if err != nil { 310 return nil, nil, err 311 } 312 clock := time.RealtimeClockFromContext(ctx) 313 memUsage := usage.Tmpfs 314 if tmpfsOpts.Usage != nil { 315 memUsage = *tmpfsOpts.Usage 316 } 317 fs := filesystem{ 318 mf: mf, 319 privateMF: privateMF, 320 mfp: mfp, 321 clock: clock, 322 devMinor: devMinor, 323 mopts: opts.Data, 324 usage: memUsage, 325 maxFilenameLen: linux.NAME_MAX, 326 maxSizeInPages: maxSizeInPages, 327 allowXattrPrefix: allowXattrPrefix, 328 } 329 fs.vfsfs.Init(vfsObj, newFSType, &fs) 330 if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 { 331 fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen 332 } 333 334 var root *dentry 335 switch rootFileType { 336 case linux.S_IFREG: 337 root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */)) 338 case linux.S_IFLNK: 339 root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */)) 340 case linux.S_IFDIR: 341 root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry 342 default: 343 fs.vfsfs.DecRef(ctx) 344 return nil, nil, fmt.Errorf("invalid 
tmpfs root file type: %#o", rootFileType) 345 } 346 fs.root = root 347 return &fs.vfsfs, &root.vfsd, nil 348 } 349 350 // Release implements vfs.FilesystemImpl.Release. 351 func (fs *filesystem) Release(ctx context.Context) { 352 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 353 fs.mu.Lock() 354 if fs.root.inode.isDir() { 355 fs.root.releaseChildrenLocked(ctx) 356 } 357 fs.mu.Unlock() 358 if fs.privateMF { 359 fs.mf.Destroy() 360 } 361 } 362 363 // releaseChildrenLocked is called on the mount point by filesystem.Release() to 364 // destroy all objects in the mount. It performs a depth-first walk of the 365 // filesystem and "unlinks" everything by decrementing link counts 366 // appropriately. There should be no open file descriptors when this is called, 367 // so each inode should only have one outstanding reference that is removed once 368 // its link count hits zero. 369 // 370 // Note that we do not update filesystem state precisely while tearing down (for 371 // instance, the child maps are ignored)--we only care to remove all remaining 372 // references so that every filesystem object gets destroyed. Also note that we 373 // do not need to trigger DecRef on the mount point itself or any child mount; 374 // these are taken care of by the destructor of the enclosing MountNamespace. 375 // 376 // Precondition: filesystem.mu is held. 377 func (d *dentry) releaseChildrenLocked(ctx context.Context) { 378 dir := d.inode.impl.(*directory) 379 for _, child := range dir.childMap { 380 if child.inode.isDir() { 381 child.releaseChildrenLocked(ctx) 382 child.inode.decLinksLocked(ctx) // link for child/. 383 dir.inode.decLinksLocked(ctx) // link for child/.. 
384 } 385 child.inode.decLinksLocked(ctx) // link for child 386 } 387 } 388 389 func (fs *filesystem) statFS() linux.Statfs { 390 st := linux.Statfs{ 391 Type: linux.TMPFS_MAGIC, 392 BlockSize: hostarch.PageSize, 393 FragmentSize: hostarch.PageSize, 394 NameLength: linux.NAME_MAX, 395 } 396 397 // If size is set for tmpfs return set values. 398 st.Blocks = fs.maxSizeInPages 399 pagesUsed := fs.pagesUsed.Load() 400 st.BlocksFree = fs.maxSizeInPages - pagesUsed 401 st.BlocksAvailable = fs.maxSizeInPages - pagesUsed 402 return st 403 } 404 405 // dentry implements vfs.DentryImpl. 406 // 407 // +stateify savable 408 type dentry struct { 409 vfsd vfs.Dentry 410 411 // parent is this dentry's parent directory. Each referenced dentry holds a 412 // reference on parent.dentry. If this dentry is a filesystem root, parent 413 // is nil. parent is protected by filesystem.mu. 414 parent *dentry 415 416 // name is the name of this dentry in its parent. If this dentry is a 417 // filesystem root, name is the empty string. name is protected by 418 // filesystem.mu. 419 name string 420 421 // dentryEntry (ugh) links dentries into their parent directory.childList. 422 dentryEntry 423 424 // inode is the inode represented by this dentry. Multiple Dentries may 425 // share a single non-directory inode (with hard links). inode is 426 // immutable. 427 // 428 // tmpfs doesn't count references on dentries; because the dentry tree is 429 // the sole source of truth, it is by definition always consistent with the 430 // state of the filesystem. However, it does count references on inodes, 431 // because inode resources are released when all references are dropped. 432 // dentry therefore forwards reference counting directly to inode. 433 inode *inode 434 } 435 436 func (fs *filesystem) newDentry(inode *inode) *dentry { 437 d := &dentry{ 438 inode: inode, 439 } 440 d.vfsd.Init(d) 441 return d 442 } 443 444 // IncRef implements vfs.DentryImpl.IncRef. 
func (d *dentry) IncRef() {
	d.inode.incRef()
}

// TryIncRef implements vfs.DentryImpl.TryIncRef.
func (d *dentry) TryIncRef() bool {
	return d.inode.tryIncRef()
}

// DecRef implements vfs.DentryImpl.DecRef.
func (d *dentry) DecRef(ctx context.Context) {
	d.inode.decRef(ctx)
}

// InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
//
// It delivers the event to the watches on d's parent (if any) and then to the
// watches on d itself, holding filesystem.mu for reading throughout.
func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
	if d.inode.isDir() {
		events |= linux.IN_ISDIR
	}

	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
	// that d was deleted.
	deleted := d.vfsd.IsDead()

	// d.parent and d.name are protected by filesystem.mu.
	d.inode.fs.mu.RLock()
	// The ordering below is important, Linux always notifies the parent first.
	if d.parent != nil {
		d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
	}
	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
	d.inode.fs.mu.RUnlock()
}

// Watches implements vfs.DentryImpl.Watches.
func (d *dentry) Watches() *vfs.Watches {
	return &d.inode.watches
}

// OnZeroWatches implements vfs.Dentry.OnZeroWatches.
func (d *dentry) OnZeroWatches(context.Context) {}

// inode represents a filesystem object.
//
// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// A reference is held on all inodes as long as they are reachable in the
	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
	// nlink reaches 0.
	refs inodeRefs

	// xattrs implements extended attributes.
	//
	// TODO(b/148380782): Support xattrs other than user.*
	xattrs memxattr.SimpleExtendedAttributes

	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
	mu    inodeMutex          `state:"nosave"`
	mode  atomicbitops.Uint32 // file type and mode
	nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu
	uid   atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   atomicbitops.Uint32 // auth.KGID, but ...
	ino   uint64              // immutable

	// Linux's tmpfs has no concept of btime.
	atime atomicbitops.Int64 // nanoseconds
	ctime atomicbitops.Int64 // nanoseconds
	mtime atomicbitops.Int64 // nanoseconds

	locks vfs.FileLocks

	// Inotify watches for this inode.
	watches vfs.Watches

	// impl is the type-specific implementation of this inode; statTo() and
	// direntType() switch on *regularFile, *directory, *symlink, *namedPipe,
	// *socketFile and *deviceFile.
	impl any // immutable
}

// maxLinks is the link count at which incLinksLocked() refuses to add
// another link.
const maxLinks = math.MaxUint32

// init initializes i as a new inode of fs with the given implementation,
// owner and mode. It allocates an inode number, sets atime, ctime and mtime
// to the current time, and initializes the reference count. The link count is
// not set here; the caller must do so.
//
// mode must include a file type, otherwise init panics. If parentDir is
// non-nil and has the setgid bit set, the new inode takes parentDir's group
// instead of kgid and, if it is a directory, the setgid bit as well.
func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) {
	if mode.FileType() == 0 {
		panic("file type is required in FileMode")
	}

	// Inherit the group and setgid bit as in fs/inode.c:inode_init_owner().
	if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID {
		kgid = auth.KGID(parentDir.inode.gid.Load())
		if mode&linux.S_IFDIR == linux.S_IFDIR {
			mode |= linux.S_ISGID
		}
	}

	i.fs = fs
	i.mode = atomicbitops.FromUint32(uint32(mode))
	i.uid = atomicbitops.FromUint32(uint32(kuid))
	i.gid = atomicbitops.FromUint32(uint32(kgid))
	i.ino = fs.nextInoMinusOne.Add(1)
	// Tmpfs creation sets atime, ctime, and mtime to current time.
	now := fs.clock.Now().Nanoseconds()
	i.atime = atomicbitops.FromInt64(now)
	i.ctime = atomicbitops.FromInt64(now)
	i.mtime = atomicbitops.FromInt64(now)
	// i.nlink initialized by caller
	i.impl = impl
	i.refs.InitRefs()
}

// incLinksLocked increments i's link count.
//
// Preconditions:
//   - filesystem.mu must be locked for writing.
//   - i.mu must be locked.
//   - i.nlink != 0.
//   - i.nlink < maxLinks.
562 func (i *inode) incLinksLocked() { 563 if i.nlink.RacyLoad() == 0 { 564 panic("tmpfs.inode.incLinksLocked() called with no existing links") 565 } 566 if i.nlink.RacyLoad() == maxLinks { 567 panic("tmpfs.inode.incLinksLocked() called with maximum link count") 568 } 569 i.nlink.Add(1) 570 } 571 572 // decLinksLocked decrements i's link count. If the link count reaches 0, we 573 // remove a reference on i as well. 574 // 575 // Preconditions: 576 // - filesystem.mu must be locked for writing. 577 // - i.mu must be lcoked. 578 // - i.nlink != 0. 579 func (i *inode) decLinksLocked(ctx context.Context) { 580 if i.nlink.RacyLoad() == 0 { 581 panic("tmpfs.inode.decLinksLocked() called with no existing links") 582 } 583 if i.nlink.Add(^uint32(0)) == 0 { 584 i.decRef(ctx) 585 } 586 } 587 588 func (i *inode) incRef() { 589 i.refs.IncRef() 590 } 591 592 func (i *inode) tryIncRef() bool { 593 return i.refs.TryIncRef() 594 } 595 596 func (i *inode) decRef(ctx context.Context) { 597 i.refs.DecRef(func() { 598 i.watches.HandleDeletion(ctx) 599 // Remove pages used if child being removed is a SymLink or Regular File. 600 switch impl := i.impl.(type) { 601 case *symlink: 602 if len(impl.target) >= shortSymlinkLen { 603 impl.inode.fs.unaccountPages(1) 604 } 605 case *regularFile: 606 // Release memory used by regFile to store data. Since regFile is 607 // no longer usable, we don't need to grab any locks or update any 608 // metadata. 609 pagesDec := impl.data.DropAll(i.fs.mf) 610 impl.inode.fs.unaccountPages(pagesDec) 611 } 612 613 }) 614 } 615 616 func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 617 mode := linux.FileMode(i.mode.Load()) 618 return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())) 619 } 620 621 // Go won't inline this function, and returning linux.Statx (which is quite 622 // big) means spending a lot of time in runtime.duffcopy(), so instead it's an 623 // output parameter. 
//
// Note that Linux does not guarantee to return consistent data (in the case of
// a concurrent modification), so we do not require holding inode.mu.
func (i *inode) statTo(stat *linux.Statx) {
	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
		linux.STATX_MTIME
	stat.Blksize = hostarch.PageSize
	stat.Nlink = i.nlink.Load()
	stat.UID = i.uid.Load()
	stat.GID = i.gid.Load()
	stat.Mode = uint16(i.mode.Load())
	stat.Ino = i.ino
	stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load())
	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load())
	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load())
	stat.DevMajor = linux.UNNAMED_MAJOR
	stat.DevMinor = i.fs.devMinor
	// Size, block count and device numbers depend on the kind of inode.
	switch impl := i.impl.(type) {
	case *regularFile:
		stat.Size = uint64(impl.size.Load())
		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
		// too expensive to compute here. Cache it in regularFile.
		stat.Blocks = allocatedBlocksForSize(stat.Size)
	case *directory:
		// The 2 accounts for two entries in addition to the children
		// (presumably "." and "..").
		stat.Size = direntSize * (2 + uint64(impl.numChildren.Load()))
		// stat.Blocks is 0.
	case *symlink:
		stat.Size = uint64(len(impl.target))
		// stat.Blocks is 0.
	case *namedPipe, *socketFile:
		// stat.Size and stat.Blocks are 0.
	case *deviceFile:
		// stat.Size and stat.Blocks are 0.
		stat.RdevMajor = impl.major
		stat.RdevMinor = impl.minor
	default:
		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
	}
}

// setStat applies the fields of opts.Stat selected by opts.Stat.Mask to i.
//
// Only mode, uid, gid, atime, mtime, ctime and size may be set; any other
// mask bit yields EPERM. Permission is checked with vfs.CheckSetStat before
// anything is changed. Setting the size is only supported on regular files:
// it yields EISDIR on directories and EINVAL on every other kind of inode.
//
// Changing the owner or group, or a truncation that actually changes the
// file, clears the setuid and setgid bits. ctime is bumped to the current
// time whenever something was changed, and mtime after an effective
// truncation, unless the caller set that timestamp explicitly.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(i.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that the setuid/setgid bits still have to be cleared.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		i.uid.Store(stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		i.gid.Store(stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Replace the permission bits while keeping the file type. The mode is
		// updated with compare-and-swap, so retry until the swap succeeds.
		for {
			old := i.mode.Load()
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				// The bits were cleared as part of this update, so the separate
				// clearing pass below is not needed.
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			i.atime.Store(now)
		} else {
			i.atime.Store(stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			i.mtime.Store(now)
		} else {
			i.mtime.Store(stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			i.ctime.Store(now)
		} else {
			i.ctime.Store(stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := i.mode.Load()
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				break
			}
		}
		needsCtimeBump = true
	}

	if needsMtimeBump {
		i.mtime.Store(now)
	}
	if needsCtimeBump {
		i.ctime.Store(now)
	}

	return nil
}

// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
// size is independent of the "preferred block size for I/O", struct
// stat::st_blksize and struct statx::stx_blksize.)
787 func allocatedBlocksForSize(size uint64) uint64 { 788 return (size + 511) / 512 789 } 790 791 func (i *inode) direntType() uint8 { 792 switch impl := i.impl.(type) { 793 case *regularFile: 794 return linux.DT_REG 795 case *directory: 796 return linux.DT_DIR 797 case *symlink: 798 return linux.DT_LNK 799 case *socketFile: 800 return linux.DT_SOCK 801 case *namedPipe: 802 return linux.DT_FIFO 803 case *deviceFile: 804 switch impl.kind { 805 case vfs.BlockDevice: 806 return linux.DT_BLK 807 case vfs.CharDevice: 808 return linux.DT_CHR 809 default: 810 panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) 811 } 812 default: 813 panic(fmt.Sprintf("unknown inode type: %T", i.impl)) 814 } 815 } 816 817 func (i *inode) isDir() bool { 818 mode := linux.FileMode(i.mode.Load()) 819 return mode.FileType() == linux.S_IFDIR 820 } 821 822 func (i *inode) touchAtime(mnt *vfs.Mount) { 823 if mnt.Flags.NoATime { 824 return 825 } 826 if err := mnt.CheckBeginWrite(); err != nil { 827 return 828 } 829 now := i.fs.clock.Now().Nanoseconds() 830 i.mu.Lock() 831 i.atime.Store(now) 832 i.mu.Unlock() 833 mnt.EndWrite() 834 } 835 836 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 837 func (i *inode) touchCtime() { 838 now := i.fs.clock.Now().Nanoseconds() 839 i.mu.Lock() 840 i.ctime.Store(now) 841 i.mu.Unlock() 842 } 843 844 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 845 func (i *inode) touchCMtime() { 846 now := i.fs.clock.Now().Nanoseconds() 847 i.mu.Lock() 848 i.mtime.Store(now) 849 i.ctime.Store(now) 850 i.mu.Unlock() 851 } 852 853 // Preconditions: 854 // - The caller has called vfs.Mount.CheckBeginWrite(). 855 // - inode.mu must be locked. 
856 func (i *inode) touchCMtimeLocked() { 857 now := i.fs.clock.Now().Nanoseconds() 858 i.mtime.Store(now) 859 i.ctime.Store(now) 860 } 861 862 func (i *inode) checkXattrPrefix(name string) error { 863 for prefix := range i.fs.allowXattrPrefix { 864 if strings.HasPrefix(name, prefix) { 865 return nil 866 } 867 } 868 return linuxerr.EOPNOTSUPP 869 } 870 871 func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { 872 return i.xattrs.ListXattr(creds, size) 873 } 874 875 func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { 876 if err := i.checkXattrPrefix(opts.Name); err != nil { 877 return "", err 878 } 879 mode := linux.FileMode(i.mode.Load()) 880 kuid := auth.KUID(i.uid.Load()) 881 kgid := auth.KGID(i.gid.Load()) 882 if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { 883 return "", err 884 } 885 return i.xattrs.GetXattr(creds, mode, kuid, opts) 886 } 887 888 func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { 889 if err := i.checkXattrPrefix(opts.Name); err != nil { 890 return err 891 } 892 mode := linux.FileMode(i.mode.Load()) 893 kuid := auth.KUID(i.uid.Load()) 894 kgid := auth.KGID(i.gid.Load()) 895 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 896 return err 897 } 898 return i.xattrs.SetXattr(creds, mode, kuid, opts) 899 } 900 901 func (i *inode) removeXattr(creds *auth.Credentials, name string) error { 902 if err := i.checkXattrPrefix(name); err != nil { 903 return err 904 } 905 mode := linux.FileMode(i.mode.Load()) 906 kuid := auth.KUID(i.uid.Load()) 907 kgid := auth.KGID(i.gid.Load()) 908 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 909 return err 910 } 911 return i.xattrs.RemoveXattr(creds, mode, kuid, name) 912 } 913 914 // fileDescription is embedded by tmpfs implementations of 915 // vfs.FileDescriptionImpl. 
916 // 917 // +stateify savable 918 type fileDescription struct { 919 vfsfd vfs.FileDescription 920 vfs.FileDescriptionDefaultImpl 921 vfs.LockFD 922 } 923 924 func (fd *fileDescription) filesystem() *filesystem { 925 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 926 } 927 928 func (fd *fileDescription) dentry() *dentry { 929 return fd.vfsfd.Dentry().Impl().(*dentry) 930 } 931 932 func (fd *fileDescription) inode() *inode { 933 return fd.dentry().inode 934 } 935 936 // Stat implements vfs.FileDescriptionImpl.Stat. 937 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 938 var stat linux.Statx 939 fd.inode().statTo(&stat) 940 return stat, nil 941 } 942 943 // SetStat implements vfs.FileDescriptionImpl.SetStat. 944 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 945 return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts) 946 } 947 948 // StatFS implements vfs.FileDescriptionImpl.StatFS. 949 func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { 950 return fd.filesystem().statFS(), nil 951 } 952 953 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 954 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 955 return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) 956 } 957 958 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 959 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 960 return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) 961 } 962 963 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 964 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 965 return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts) 966 } 967 968 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
969 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 970 return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name) 971 } 972 973 // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all 974 // filesystem state is in-memory. 975 func (*fileDescription) Sync(context.Context) error { 976 return nil 977 } 978 979 // parseSize converts size in string to an integer bytes. 980 // Supported suffixes in string are:K, M, G, T, P, E. 981 func parseSize(s string) (uint64, error) { 982 if len(s) == 0 { 983 return 0, fmt.Errorf("size parameter empty") 984 } 985 suffix := s[len(s)-1] 986 count := 1 987 switch suffix { 988 case 'e', 'E': 989 count = count << 10 990 fallthrough 991 case 'p', 'P': 992 count = count << 10 993 fallthrough 994 case 't', 'T': 995 count = count << 10 996 fallthrough 997 case 'g', 'G': 998 count = count << 10 999 fallthrough 1000 case 'm', 'M': 1001 count = count << 10 1002 fallthrough 1003 case 'k', 'K': 1004 count = count << 10 1005 s = s[:len(s)-1] 1006 } 1007 byteTmp, err := strconv.ParseUint(s, 10, 64) 1008 if err != nil { 1009 return 0, linuxerr.EINVAL 1010 } 1011 // Check for overflow. 1012 bytes := byteTmp * uint64(count) 1013 if byteTmp != 0 && bytes/byteTmp != uint64(count) { 1014 return 0, fmt.Errorf("size overflow") 1015 } 1016 return bytes, err 1017 }