github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/tmpfs/tmpfs.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package tmpfs provides an in-memory filesystem whose contents are 16 // application-mutable, consistent with Linux's tmpfs. 17 // 18 // Lock order: 19 // 20 // filesystem.mu 21 // inode.mu 22 // regularFileFD.offMu 23 // *** "memmap.Mappable locks" below this point 24 // regularFile.mapsMu 25 // *** "memmap.Mappable locks taken by Translate" below this point 26 // regularFile.dataMu 27 // fs.pagesUsedMu 28 // directory.iterMu 29 package tmpfs 30 31 import ( 32 "fmt" 33 "math" 34 "strconv" 35 "strings" 36 "sync/atomic" 37 38 "github.com/metacubex/gvisor/pkg/abi/linux" 39 "github.com/metacubex/gvisor/pkg/atomicbitops" 40 "github.com/metacubex/gvisor/pkg/context" 41 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 42 "github.com/metacubex/gvisor/pkg/hostarch" 43 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 44 "github.com/metacubex/gvisor/pkg/sentry/kernel/time" 45 "github.com/metacubex/gvisor/pkg/sentry/pgalloc" 46 "github.com/metacubex/gvisor/pkg/sentry/usage" 47 "github.com/metacubex/gvisor/pkg/sentry/vfs" 48 "github.com/metacubex/gvisor/pkg/sentry/vfs/memxattr" 49 ) 50 51 // Name is the default filesystem name. 52 const Name = "tmpfs" 53 54 // FilesystemType implements vfs.FilesystemType. 
55 // 56 // +stateify savable 57 type FilesystemType struct{} 58 59 // filesystem implements vfs.FilesystemImpl. 60 // 61 // +stateify savable 62 type filesystem struct { 63 vfsfs vfs.Filesystem 64 65 // mf is used to allocate memory that stores regular file contents. mf is 66 // immutable, except it is changed during restore. 67 mf *pgalloc.MemoryFile `state:".(string)"` 68 69 // clock is a realtime clock used to set timestamps in file operations. 70 clock time.Clock 71 72 // devMinor is the filesystem's minor device number. devMinor is immutable. 73 devMinor uint32 74 75 // mopts contains the tmpfs-specific mount options passed to this 76 // filesystem. Immutable. 77 mopts string 78 79 // usage is the memory accounting category under which pages backing 80 // files in this filesystem are accounted. 81 usage usage.MemoryKind 82 83 // mu serializes changes to the Dentry tree. 84 mu filesystemRWMutex `state:"nosave"` 85 86 nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations 87 88 root *dentry 89 90 maxFilenameLen int 91 92 // maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages. 93 // This field is immutable. 94 maxSizeInPages uint64 95 96 // pagesUsed is the number of pages used by this filesystem. 97 pagesUsed atomicbitops.Uint64 98 99 // allowXattrPrefix is a set of xattr namespace prefixes that this 100 // tmpfs mount will allow. It is immutable. 101 allowXattrPrefix map[string]struct{} 102 } 103 104 // Name implements vfs.FilesystemType.Name. 105 func (FilesystemType) Name() string { 106 return Name 107 } 108 109 // Release implements vfs.FilesystemType.Release. 110 func (FilesystemType) Release(ctx context.Context) {} 111 112 // FilesystemOpts is used to pass configuration data to tmpfs. 113 // 114 // +stateify savable 115 type FilesystemOpts struct { 116 // RootFileType is the FileType of the filesystem root. Valid values 117 // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. 
118 RootFileType uint16 119 120 // RootSymlinkTarget is the target of the root symlink. Only valid if 121 // RootFileType == S_IFLNK. 122 RootSymlinkTarget string 123 124 // FilesystemType allows setting a different FilesystemType for this 125 // tmpfs filesystem. This allows tmpfs to "impersonate" other 126 // filesystems, like ramdiskfs and cgroupfs. 127 FilesystemType vfs.FilesystemType 128 129 // Usage is the memory accounting category under which pages backing files in 130 // the filesystem are accounted. 131 Usage *usage.MemoryKind 132 133 // MaxFilenameLen is the maximum filename length allowed by the tmpfs. 134 MaxFilenameLen int 135 136 // MemoryFile is the memory file that will be used to store file data. If 137 // this is nil, then MemoryFileFromContext() is used. 138 MemoryFile *pgalloc.MemoryFile 139 140 // DisableDefaultSizeLimit disables setting a default size limit. In Linux, 141 // SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super(). 142 DisableDefaultSizeLimit bool 143 144 // AllowXattrPrefix is a set of xattr namespace prefixes that this 145 // tmpfs mount will allow. 146 AllowXattrPrefix []string 147 } 148 149 // Default size limit mount option. It is immutable after initialization. 150 var defaultSizeLimit uint64 151 152 // SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts 153 // that do not specify a size= mount option. This must be called only once, 154 // before any tmpfs filesystems are created. 155 func SetDefaultSizeLimit(sizeLimit uint64) { 156 defaultSizeLimit = sizeLimit 157 } 158 159 func getDefaultSizeLimit(disable bool) uint64 { 160 if disable || defaultSizeLimit == 0 { 161 // The size limit is used to populate statfs(2) results. If Linux tmpfs is 162 // mounted with no size option, then statfs(2) returns f_blocks == f_bfree 163 // == f_bavail == 0. However, many applications treat this as having a size 164 // limit of 0. 
To work around this, return a very large but non-zero size 165 // limit, chosen to ensure that it does not overflow int64. 166 return math.MaxInt64 167 } 168 return defaultSizeLimit 169 } 170 171 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 172 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 173 mf := pgalloc.MemoryFileFromContext(ctx) 174 if mf == nil { 175 panic("CtxMemoryFile returned nil") 176 } 177 rootFileType := uint16(linux.S_IFDIR) 178 disableDefaultSizeLimit := false 179 newFSType := vfs.FilesystemType(&fstype) 180 181 // By default we support only "trusted" and "user" namespaces. Linux 182 // also supports "security" and (if configured) POSIX ACL namespaces 183 // "system.posix_acl_access" and "system.posix_acl_default". 184 allowXattrPrefix := map[string]struct{}{ 185 linux.XATTR_TRUSTED_PREFIX: struct{}{}, 186 linux.XATTR_USER_PREFIX: struct{}{}, 187 // The "security" namespace is allowed, but it always returns an error. 
188 linux.XATTR_SECURITY_PREFIX: struct{}{}, 189 } 190 191 tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts) 192 if tmpfsOptsOk { 193 if tmpfsOpts.RootFileType != 0 { 194 rootFileType = tmpfsOpts.RootFileType 195 } 196 if tmpfsOpts.FilesystemType != nil { 197 newFSType = tmpfsOpts.FilesystemType 198 } 199 disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit 200 if tmpfsOpts.MemoryFile != nil { 201 mf = tmpfsOpts.MemoryFile 202 } 203 for _, xattr := range tmpfsOpts.AllowXattrPrefix { 204 allowXattrPrefix[xattr] = struct{}{} 205 } 206 } 207 208 mopts := vfs.GenericParseMountOptions(opts.Data) 209 rootMode := linux.FileMode(0777) 210 if rootFileType == linux.S_IFDIR { 211 rootMode = 01777 212 } 213 modeStr, ok := mopts["mode"] 214 if ok { 215 delete(mopts, "mode") 216 mode, err := strconv.ParseUint(modeStr, 8, 32) 217 if err != nil { 218 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) 219 return nil, nil, linuxerr.EINVAL 220 } 221 rootMode = linux.FileMode(mode & 07777) 222 } 223 rootKUID := creds.EffectiveKUID 224 uidStr, ok := mopts["uid"] 225 if ok { 226 delete(mopts, "uid") 227 uid, err := strconv.ParseUint(uidStr, 10, 32) 228 if err != nil { 229 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) 230 return nil, nil, linuxerr.EINVAL 231 } 232 kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) 233 if !kuid.Ok() { 234 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) 235 return nil, nil, linuxerr.EINVAL 236 } 237 rootKUID = kuid 238 } 239 rootKGID := creds.EffectiveKGID 240 gidStr, ok := mopts["gid"] 241 if ok { 242 delete(mopts, "gid") 243 gid, err := strconv.ParseUint(gidStr, 10, 32) 244 if err != nil { 245 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) 246 return nil, nil, linuxerr.EINVAL 247 } 248 kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) 249 if !kgid.Ok() { 250 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: 
%d", gid) 251 return nil, nil, linuxerr.EINVAL 252 } 253 rootKGID = kgid 254 } 255 maxSizeInPages := getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize 256 maxSizeStr, ok := mopts["size"] 257 if ok { 258 delete(mopts, "size") 259 maxSizeInBytes, err := parseSize(maxSizeStr) 260 if err != nil { 261 ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err) 262 return nil, nil, linuxerr.EINVAL 263 } 264 // Convert size in bytes to nearest Page Size bytes 265 // as Linux allocates memory in terms of Page size. 266 maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes) 267 if !ok { 268 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok) 269 return nil, nil, linuxerr.EINVAL 270 } 271 } 272 273 if len(mopts) != 0 { 274 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 275 return nil, nil, linuxerr.EINVAL 276 } 277 278 devMinor, err := vfsObj.GetAnonBlockDevMinor() 279 if err != nil { 280 return nil, nil, err 281 } 282 clock := time.RealtimeClockFromContext(ctx) 283 memUsage := usage.Tmpfs 284 if tmpfsOpts.Usage != nil { 285 memUsage = *tmpfsOpts.Usage 286 } 287 fs := filesystem{ 288 mf: mf, 289 clock: clock, 290 devMinor: devMinor, 291 mopts: opts.Data, 292 usage: memUsage, 293 maxFilenameLen: linux.NAME_MAX, 294 maxSizeInPages: maxSizeInPages, 295 allowXattrPrefix: allowXattrPrefix, 296 } 297 fs.vfsfs.Init(vfsObj, newFSType, &fs) 298 if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 { 299 fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen 300 } 301 302 var root *dentry 303 switch rootFileType { 304 case linux.S_IFREG: 305 root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */)) 306 case linux.S_IFLNK: 307 root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */)) 308 case linux.S_IFDIR: 309 root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry 310 default: 311 
fs.vfsfs.DecRef(ctx) 312 return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) 313 } 314 fs.root = root 315 return &fs.vfsfs, &root.vfsd, nil 316 } 317 318 // Release implements vfs.FilesystemImpl.Release. 319 func (fs *filesystem) Release(ctx context.Context) { 320 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 321 fs.mu.Lock() 322 if fs.root.inode.isDir() { 323 fs.root.releaseChildrenLocked(ctx) 324 } 325 fs.mu.Unlock() 326 if fs.mf.RestoreID() != "" { 327 // If RestoreID is set, then this is a private MemoryFile which needs to be 328 // destroyed since this tmpfs is the only user. 329 fs.mf.Destroy() 330 } 331 } 332 333 // releaseChildrenLocked is called on the mount point by filesystem.Release() to 334 // destroy all objects in the mount. It performs a depth-first walk of the 335 // filesystem and "unlinks" everything by decrementing link counts 336 // appropriately. There should be no open file descriptors when this is called, 337 // so each inode should only have one outstanding reference that is removed once 338 // its link count hits zero. 339 // 340 // Note that we do not update filesystem state precisely while tearing down (for 341 // instance, the child maps are ignored)--we only care to remove all remaining 342 // references so that every filesystem object gets destroyed. Also note that we 343 // do not need to trigger DecRef on the mount point itself or any child mount; 344 // these are taken care of by the destructor of the enclosing MountNamespace. 345 // 346 // Precondition: filesystem.mu is held. 347 func (d *dentry) releaseChildrenLocked(ctx context.Context) { 348 dir := d.inode.impl.(*directory) 349 for _, child := range dir.childMap { 350 if child.inode.isDir() { 351 child.releaseChildrenLocked(ctx) 352 child.inode.decLinksLocked(ctx) // link for child/. 353 dir.inode.decLinksLocked(ctx) // link for child/.. 
354 } 355 child.inode.decLinksLocked(ctx) // link for child 356 } 357 } 358 359 func (fs *filesystem) statFS() linux.Statfs { 360 st := linux.Statfs{ 361 Type: linux.TMPFS_MAGIC, 362 BlockSize: hostarch.PageSize, 363 FragmentSize: hostarch.PageSize, 364 NameLength: linux.NAME_MAX, 365 } 366 367 // If size is set for tmpfs return set values. 368 st.Blocks = fs.maxSizeInPages 369 pagesUsed := fs.pagesUsed.Load() 370 st.BlocksFree = fs.maxSizeInPages - pagesUsed 371 st.BlocksAvailable = fs.maxSizeInPages - pagesUsed 372 return st 373 } 374 375 // dentry implements vfs.DentryImpl. 376 // 377 // +stateify savable 378 type dentry struct { 379 vfsd vfs.Dentry 380 381 // parent is this dentry's parent directory. Each referenced dentry holds a 382 // reference on parent.dentry. If this dentry is a filesystem root, parent 383 // is nil. parent is protected by filesystem.mu. 384 parent atomic.Pointer[dentry] `state:".(*dentry)"` 385 386 // name is the name of this dentry in its parent. If this dentry is a 387 // filesystem root, name is the empty string. name is protected by 388 // filesystem.mu. 389 name string 390 391 // dentryEntry (ugh) links dentries into their parent directory.childList. 392 dentryEntry 393 394 // inode is the inode represented by this dentry. Multiple Dentries may 395 // share a single non-directory inode (with hard links). inode is 396 // immutable. 397 // 398 // tmpfs doesn't count references on dentries; because the dentry tree is 399 // the sole source of truth, it is by definition always consistent with the 400 // state of the filesystem. However, it does count references on inodes, 401 // because inode resources are released when all references are dropped. 402 // dentry therefore forwards reference counting directly to inode. 403 inode *inode 404 } 405 406 func (fs *filesystem) newDentry(inode *inode) *dentry { 407 d := &dentry{ 408 inode: inode, 409 } 410 d.vfsd.Init(d) 411 return d 412 } 413 414 // IncRef implements vfs.DentryImpl.IncRef. 
415 func (d *dentry) IncRef() { 416 d.inode.incRef() 417 } 418 419 // TryIncRef implements vfs.DentryImpl.TryIncRef. 420 func (d *dentry) TryIncRef() bool { 421 return d.inode.tryIncRef() 422 } 423 424 // DecRef implements vfs.DentryImpl.DecRef. 425 func (d *dentry) DecRef(ctx context.Context) { 426 d.inode.decRef(ctx) 427 } 428 429 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 430 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 431 if d.inode.isDir() { 432 events |= linux.IN_ISDIR 433 } 434 435 // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 436 // that d was deleted. 437 deleted := d.vfsd.IsDead() 438 439 d.inode.fs.mu.RLock() 440 // The ordering below is important, Linux always notifies the parent first. 441 parent := d.parent.Load() 442 if parent != nil { 443 parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) 444 } 445 d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) 446 d.inode.fs.mu.RUnlock() 447 } 448 449 // Watches implements vfs.DentryImpl.Watches. 450 func (d *dentry) Watches() *vfs.Watches { 451 return &d.inode.watches 452 } 453 454 // OnZeroWatches implements vfs.Dentry.OnZeroWatches. 455 func (d *dentry) OnZeroWatches(context.Context) {} 456 457 // inode represents a filesystem object. 458 // 459 // +stateify savable 460 type inode struct { 461 // fs is the owning filesystem. fs is immutable. 462 fs *filesystem 463 464 // A reference is held on all inodes as long as they are reachable in the 465 // filesystem tree, i.e. nlink is nonzero. This reference is dropped when 466 // nlink reaches 0. 467 refs inodeRefs 468 469 // xattrs implements extended attributes. 470 // 471 // TODO(b/148380782): Support xattrs other than user.* 472 xattrs memxattr.SimpleExtendedAttributes 473 474 // Inode metadata. Writing multiple fields atomically requires holding 475 // mu, otherwise atomic operations can be used. 
476 mu inodeMutex `state:"nosave"` 477 mode atomicbitops.Uint32 // file type and mode 478 nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu 479 uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 480 gid atomicbitops.Uint32 // auth.KGID, but ... 481 ino uint64 // immutable 482 483 // Linux's tmpfs has no concept of btime. 484 atime atomicbitops.Int64 // nanoseconds 485 ctime atomicbitops.Int64 // nanoseconds 486 mtime atomicbitops.Int64 // nanoseconds 487 488 locks vfs.FileLocks 489 490 // Inotify watches for this inode. 491 watches vfs.Watches 492 493 impl any // immutable 494 } 495 496 const maxLinks = math.MaxUint32 497 498 func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) { 499 if mode.FileType() == 0 { 500 panic("file type is required in FileMode") 501 } 502 503 // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). 504 if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID { 505 kgid = auth.KGID(parentDir.inode.gid.Load()) 506 if mode&linux.S_IFDIR == linux.S_IFDIR { 507 mode |= linux.S_ISGID 508 } 509 } 510 511 i.fs = fs 512 i.mode = atomicbitops.FromUint32(uint32(mode)) 513 i.uid = atomicbitops.FromUint32(uint32(kuid)) 514 i.gid = atomicbitops.FromUint32(uint32(kgid)) 515 i.ino = fs.nextInoMinusOne.Add(1) 516 // Tmpfs creation sets atime, ctime, and mtime to current time. 517 now := fs.clock.Now().Nanoseconds() 518 i.atime = atomicbitops.FromInt64(now) 519 i.ctime = atomicbitops.FromInt64(now) 520 i.mtime = atomicbitops.FromInt64(now) 521 // i.nlink initialized by caller 522 i.impl = impl 523 i.refs.InitRefs() 524 } 525 526 // incLinksLocked increments i's link count. 527 // 528 // Preconditions: 529 // - filesystem.mu must be locked for writing. 530 // - i.mu must be lcoked. 531 // - i.nlink != 0. 532 // - i.nlink < maxLinks. 
533 func (i *inode) incLinksLocked() { 534 if i.nlink.RacyLoad() == 0 { 535 panic("tmpfs.inode.incLinksLocked() called with no existing links") 536 } 537 if i.nlink.RacyLoad() == maxLinks { 538 panic("tmpfs.inode.incLinksLocked() called with maximum link count") 539 } 540 i.nlink.Add(1) 541 } 542 543 // decLinksLocked decrements i's link count. If the link count reaches 0, we 544 // remove a reference on i as well. 545 // 546 // Preconditions: 547 // - filesystem.mu must be locked for writing. 548 // - i.mu must be lcoked. 549 // - i.nlink != 0. 550 func (i *inode) decLinksLocked(ctx context.Context) { 551 if i.nlink.RacyLoad() == 0 { 552 panic("tmpfs.inode.decLinksLocked() called with no existing links") 553 } 554 if i.nlink.Add(^uint32(0)) == 0 { 555 i.decRef(ctx) 556 } 557 } 558 559 func (i *inode) incRef() { 560 i.refs.IncRef() 561 } 562 563 func (i *inode) tryIncRef() bool { 564 return i.refs.TryIncRef() 565 } 566 567 func (i *inode) decRef(ctx context.Context) { 568 i.refs.DecRef(func() { 569 i.watches.HandleDeletion(ctx) 570 // Remove pages used if child being removed is a SymLink or Regular File. 571 switch impl := i.impl.(type) { 572 case *symlink: 573 if len(impl.target) >= shortSymlinkLen { 574 impl.inode.fs.unaccountPages(1) 575 } 576 case *regularFile: 577 // Release memory used by regFile to store data. Since regFile is 578 // no longer usable, we don't need to grab any locks or update any 579 // metadata. 580 pagesDec := impl.data.DropAll(i.fs.mf) 581 impl.inode.fs.unaccountPages(pagesDec) 582 } 583 584 }) 585 } 586 587 func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 588 mode := linux.FileMode(i.mode.Load()) 589 return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())) 590 } 591 592 // Go won't inline this function, and returning linux.Statx (which is quite 593 // big) means spending a lot of time in runtime.duffcopy(), so instead it's an 594 // output parameter. 
//
// Note that Linux does not guarantee to return consistent data (in the case of
// a concurrent modification), so we do not require holding inode.mu.
//
// statTo fills stat from i's metadata. Size and Blocks depend on the concrete
// inode implementation; an unrecognized implementation type panics.
func (i *inode) statTo(stat *linux.Statx) {
	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
		linux.STATX_MTIME
	stat.Blksize = hostarch.PageSize
	stat.Nlink = i.nlink.Load()
	stat.UID = i.uid.Load()
	stat.GID = i.gid.Load()
	stat.Mode = uint16(i.mode.Load())
	stat.Ino = i.ino
	stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load())
	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load())
	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load())
	stat.DevMajor = linux.UNNAMED_MAJOR
	stat.DevMinor = i.fs.devMinor
	switch impl := i.impl.(type) {
	case *regularFile:
		stat.Size = uint64(impl.size.Load())
		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
		// too expensive to compute here. Cache it in regularFile.
		stat.Blocks = allocatedBlocksForSize(stat.Size)
	case *directory:
		// The 2 presumably accounts for the "." and ".." entries.
		stat.Size = direntSize * (2 + uint64(impl.numChildren.Load()))
		// stat.Blocks is 0.
	case *symlink:
		stat.Size = uint64(len(impl.target))
		// stat.Blocks is 0.
	case *namedPipe, *socketFile:
		// stat.Size and stat.Blocks are 0.
	case *deviceFile:
		// stat.Size and stat.Blocks are 0.
		stat.RdevMajor = impl.major
		stat.RdevMinor = impl.minor
	default:
		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
	}
}

// setStat applies the fields selected by opts.Stat.Mask to i.
//
// An empty mask is a no-op. A mask containing anything other than MODE, UID,
// GID, ATIME, MTIME, CTIME or SIZE is rejected with EPERM. After the
// vfs.CheckSetStat permission check, the update runs with i.mu held. Changing
// the size is only possible on regular files (EISDIR for directories, EINVAL
// for everything else).
//
// A size change that actually truncated, or any UID/GID change, clears the
// SUID/SGID bits. mtime and ctime are bumped to the current time as needed,
// unless the caller set that timestamp explicitly in the same call.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(i.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that SUID/SGID must be cleared. It is reset once the
	// STATX_MODE branch has done the clearing, and otherwise handled at the end.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		i.uid.Store(stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		i.gid.Store(stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Compare-and-swap loop: the file type bits are preserved and only
		// the permission bits are replaced; retry if mode changed meanwhile.
		for {
			old := i.mode.Load()
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	// A single "now" is used for every timestamp written below.
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			i.atime.Store(now)
		} else {
			i.atime.Store(stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			i.mtime.Store(now)
		} else {
			i.mtime.Store(stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			i.ctime.Store(now)
		} else {
			i.ctime.Store(stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := i.mode.Load()
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				break
			}
		}
		// Note that this re-arms the ctime bump even if STATX_CTIME was set
		// explicitly above.
		needsCtimeBump = true
	}

	if needsMtimeBump {
		i.mtime.Store(now)
	}
	if needsCtimeBump {
		i.ctime.Store(now)
	}

	return nil
}

// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
// size is independent of the "preferred block size for I/O", struct
// stat::st_blksize and struct statx::stx_blksize.)
758 func allocatedBlocksForSize(size uint64) uint64 { 759 return (size + 511) / 512 760 } 761 762 func (i *inode) direntType() uint8 { 763 switch impl := i.impl.(type) { 764 case *regularFile: 765 return linux.DT_REG 766 case *directory: 767 return linux.DT_DIR 768 case *symlink: 769 return linux.DT_LNK 770 case *socketFile: 771 return linux.DT_SOCK 772 case *namedPipe: 773 return linux.DT_FIFO 774 case *deviceFile: 775 switch impl.kind { 776 case vfs.BlockDevice: 777 return linux.DT_BLK 778 case vfs.CharDevice: 779 return linux.DT_CHR 780 default: 781 panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) 782 } 783 default: 784 panic(fmt.Sprintf("unknown inode type: %T", i.impl)) 785 } 786 } 787 788 func (i *inode) isDir() bool { 789 mode := linux.FileMode(i.mode.Load()) 790 return mode.FileType() == linux.S_IFDIR 791 } 792 793 func (i *inode) touchAtime(mnt *vfs.Mount) { 794 if mnt.Options().Flags.NoATime { 795 return 796 } 797 if err := mnt.CheckBeginWrite(); err != nil { 798 return 799 } 800 now := i.fs.clock.Now().Nanoseconds() 801 i.mu.Lock() 802 i.atime.Store(now) 803 i.mu.Unlock() 804 mnt.EndWrite() 805 } 806 807 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 808 func (i *inode) touchCtime() { 809 now := i.fs.clock.Now().Nanoseconds() 810 i.mu.Lock() 811 i.ctime.Store(now) 812 i.mu.Unlock() 813 } 814 815 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 816 func (i *inode) touchCMtime() { 817 now := i.fs.clock.Now().Nanoseconds() 818 i.mu.Lock() 819 i.mtime.Store(now) 820 i.ctime.Store(now) 821 i.mu.Unlock() 822 } 823 824 // Preconditions: 825 // - The caller has called vfs.Mount.CheckBeginWrite(). 826 // - inode.mu must be locked. 
827 func (i *inode) touchCMtimeLocked() { 828 now := i.fs.clock.Now().Nanoseconds() 829 i.mtime.Store(now) 830 i.ctime.Store(now) 831 } 832 833 func (i *inode) checkXattrPrefix(name string) error { 834 for prefix := range i.fs.allowXattrPrefix { 835 if strings.HasPrefix(name, prefix) { 836 return nil 837 } 838 } 839 return linuxerr.EOPNOTSUPP 840 } 841 842 func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { 843 return i.xattrs.ListXattr(creds, size) 844 } 845 846 func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { 847 if err := i.checkXattrPrefix(opts.Name); err != nil { 848 return "", err 849 } 850 mode := linux.FileMode(i.mode.Load()) 851 kuid := auth.KUID(i.uid.Load()) 852 kgid := auth.KGID(i.gid.Load()) 853 if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { 854 return "", err 855 } 856 return i.xattrs.GetXattr(creds, mode, kuid, opts) 857 } 858 859 func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { 860 if err := i.checkXattrPrefix(opts.Name); err != nil { 861 return err 862 } 863 mode := linux.FileMode(i.mode.Load()) 864 kuid := auth.KUID(i.uid.Load()) 865 kgid := auth.KGID(i.gid.Load()) 866 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 867 return err 868 } 869 return i.xattrs.SetXattr(creds, mode, kuid, opts) 870 } 871 872 func (i *inode) removeXattr(creds *auth.Credentials, name string) error { 873 if err := i.checkXattrPrefix(name); err != nil { 874 return err 875 } 876 mode := linux.FileMode(i.mode.Load()) 877 kuid := auth.KUID(i.uid.Load()) 878 kgid := auth.KGID(i.gid.Load()) 879 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 880 return err 881 } 882 return i.xattrs.RemoveXattr(creds, mode, kuid, name) 883 } 884 885 // fileDescription is embedded by tmpfs implementations of 886 // vfs.FileDescriptionImpl. 
887 // 888 // +stateify savable 889 type fileDescription struct { 890 vfsfd vfs.FileDescription 891 vfs.FileDescriptionDefaultImpl 892 vfs.LockFD 893 } 894 895 func (fd *fileDescription) filesystem() *filesystem { 896 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 897 } 898 899 func (fd *fileDescription) dentry() *dentry { 900 return fd.vfsfd.Dentry().Impl().(*dentry) 901 } 902 903 func (fd *fileDescription) inode() *inode { 904 return fd.dentry().inode 905 } 906 907 // Stat implements vfs.FileDescriptionImpl.Stat. 908 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 909 var stat linux.Statx 910 fd.inode().statTo(&stat) 911 return stat, nil 912 } 913 914 // SetStat implements vfs.FileDescriptionImpl.SetStat. 915 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 916 return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts) 917 } 918 919 // StatFS implements vfs.FileDescriptionImpl.StatFS. 920 func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { 921 return fd.filesystem().statFS(), nil 922 } 923 924 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 925 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 926 return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) 927 } 928 929 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 930 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 931 return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) 932 } 933 934 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 935 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 936 return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts) 937 } 938 939 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 
940 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 941 return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name) 942 } 943 944 // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all 945 // filesystem state is in-memory. 946 func (*fileDescription) Sync(context.Context) error { 947 return nil 948 } 949 950 // parseSize converts size in string to an integer bytes. 951 // Supported suffixes in string are:K, M, G, T, P, E. 952 func parseSize(s string) (uint64, error) { 953 if len(s) == 0 { 954 return 0, fmt.Errorf("size parameter empty") 955 } 956 suffix := s[len(s)-1] 957 count := 1 958 switch suffix { 959 case 'e', 'E': 960 count = count << 10 961 fallthrough 962 case 'p', 'P': 963 count = count << 10 964 fallthrough 965 case 't', 'T': 966 count = count << 10 967 fallthrough 968 case 'g', 'G': 969 count = count << 10 970 fallthrough 971 case 'm', 'M': 972 count = count << 10 973 fallthrough 974 case 'k', 'K': 975 count = count << 10 976 s = s[:len(s)-1] 977 } 978 byteTmp, err := strconv.ParseUint(s, 10, 64) 979 if err != nil { 980 return 0, linuxerr.EINVAL 981 } 982 // Check for overflow. 983 bytes := byteTmp * uint64(count) 984 if byteTmp != 0 && bytes/byteTmp != uint64(count) { 985 return 0, fmt.Errorf("size overflow") 986 } 987 return bytes, err 988 }