github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/tmpfs/tmpfs.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package tmpfs provides an in-memory filesystem whose contents are 16 // application-mutable, consistent with Linux's tmpfs. 17 // 18 // Lock order: 19 // 20 // filesystem.mu 21 // inode.mu 22 // regularFileFD.offMu 23 // *** "memmap.Mappable locks" below this point 24 // regularFile.mapsMu 25 // *** "memmap.Mappable locks taken by Translate" below this point 26 // regularFile.dataMu 27 // fs.pagesUsedMu 28 // directory.iterMu 29 package tmpfs 30 31 import ( 32 "fmt" 33 "math" 34 "strconv" 35 "strings" 36 37 "github.com/nicocha30/gvisor-ligolo/pkg/abi/linux" 38 "github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops" 39 "github.com/nicocha30/gvisor-ligolo/pkg/context" 40 "github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr" 41 "github.com/nicocha30/gvisor-ligolo/pkg/fd" 42 "github.com/nicocha30/gvisor-ligolo/pkg/hostarch" 43 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth" 44 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time" 45 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc" 46 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage" 47 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs" 48 "github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs/memxattr" 49 ) 50 51 // Name is the default filesystem name. 52 const Name = "tmpfs" 53 54 // FilesystemType implements vfs.FilesystemType. 55 // 56 // +stateify savable 57 type FilesystemType struct{} 58 59 // filesystem implements vfs.FilesystemImpl. 60 // 61 // +stateify savable 62 type filesystem struct { 63 vfsfs vfs.Filesystem 64 65 // mf is used to allocate memory that stores regular file contents. mf is 66 // immutable, except it may to changed during restore. 67 mf *pgalloc.MemoryFile `state:"nosave"` 68 69 // privateMF indicates whether mf is private to this tmpfs mount. If so, 70 // tmpfs takes ownership of mf. privateMF is immutable. 71 privateMF bool 72 73 // mfp is used to provide mf, when privateMF == false. This is required to 74 // re-provide mf on restore. mfp is immutable. 75 mfp pgalloc.MemoryFileProvider 76 77 // clock is a realtime clock used to set timestamps in file operations. 78 clock time.Clock 79 80 // devMinor is the filesystem's minor device number. devMinor is immutable. 81 devMinor uint32 82 83 // mopts contains the tmpfs-specific mount options passed to this 84 // filesystem. Immutable. 85 mopts string 86 87 // usage is the memory accounting category under which pages backing 88 // files in this filesystem are accounted. 89 usage usage.MemoryKind 90 91 // mu serializes changes to the Dentry tree. 92 mu filesystemRWMutex `state:"nosave"` 93 94 nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations 95 96 root *dentry 97 98 maxFilenameLen int 99 100 // maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages. 101 // This field is immutable. 102 maxSizeInPages uint64 103 104 // pagesUsed is the number of pages used by this filesystem. 105 pagesUsed atomicbitops.Uint64 106 } 107 108 // Name implements vfs.FilesystemType.Name. 109 func (FilesystemType) Name() string { 110 return Name 111 } 112 113 // Release implements vfs.FilesystemType.Release. 114 func (FilesystemType) Release(ctx context.Context) {} 115 116 // FilesystemOpts is used to pass configuration data to tmpfs. 117 // 118 // +stateify savable 119 type FilesystemOpts struct { 120 // RootFileType is the FileType of the filesystem root. Valid values 121 // are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR. 122 RootFileType uint16 123 124 // RootSymlinkTarget is the target of the root symlink. Only valid if 125 // RootFileType == S_IFLNK. 126 RootSymlinkTarget string 127 128 // FilesystemType allows setting a different FilesystemType for this 129 // tmpfs filesystem. This allows tmpfs to "impersonate" other 130 // filesystems, like ramdiskfs and cgroupfs. 131 FilesystemType vfs.FilesystemType 132 133 // Usage is the memory accounting category under which pages backing files in 134 // the filesystem are accounted. 135 Usage *usage.MemoryKind 136 137 // MaxFilenameLen is the maximum filename length allowed by the tmpfs. 138 MaxFilenameLen int 139 140 // FilestoreFD is the FD for the memory file that will be used to store file 141 // data. If this is nil, then MemoryFileProviderFromContext() is used. 142 FilestoreFD *fd.FD 143 144 // DisableDefaultSizeLimit disables setting a default size limit. In Linux, 145 // SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super(). 146 DisableDefaultSizeLimit bool 147 } 148 149 // Default size limit mount option. It is immutable after initialization. 150 var defaultSizeLimit uint64 151 152 // SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts 153 // that do not specify a size= mount option. This must be called only once, 154 // before any tmpfs filesystems are created. 155 func SetDefaultSizeLimit(sizeLimit uint64) { 156 defaultSizeLimit = sizeLimit 157 } 158 159 func getDefaultSizeLimit(disable bool) uint64 { 160 if disable || defaultSizeLimit == 0 { 161 // The size limit is used to populate statfs(2) results. If Linux tmpfs is 162 // mounted with no size option, then statfs(2) returns f_blocks == f_bfree 163 // == f_bavail == 0. However, many applications treat this as having a size 164 // limit of 0. To work around this, return a very large but non-zero size 165 // limit, chosen to ensure that it does not overflow int64. 166 return math.MaxInt64 167 } 168 return defaultSizeLimit 169 } 170 171 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 172 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 173 mfp := pgalloc.MemoryFileProviderFromContext(ctx) 174 if mfp == nil { 175 panic("MemoryFileProviderFromContext returned nil") 176 } 177 mf := mfp.MemoryFile() 178 privateMF := false 179 180 rootFileType := uint16(linux.S_IFDIR) 181 disableDefaultSizeLimit := false 182 newFSType := vfs.FilesystemType(&fstype) 183 tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts) 184 if tmpfsOptsOk { 185 if tmpfsOpts.RootFileType != 0 { 186 rootFileType = tmpfsOpts.RootFileType 187 } 188 if tmpfsOpts.FilesystemType != nil { 189 newFSType = tmpfsOpts.FilesystemType 190 } 191 disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit 192 if tmpfsOpts.FilestoreFD != nil { 193 mfOpts := pgalloc.MemoryFileOpts{ 194 // tmpfsOpts.FilestoreFD may be backed by a file on disk (not memfd), 195 // which needs to be decommited on destroy to release disk space. 196 DecommitOnDestroy: true, 197 // sentry's seccomp filters don't allow the mmap(2) syscalls that 198 // pgalloc.IMAWorkAroundForMemFile() uses. Users of tmpfsOpts.FilestoreFD 199 // are expected to have performed the work around outside the sandbox. 200 DisableIMAWorkAround: true, 201 // Custom filestore FDs are usually backed by files on disk. Ideally we 202 // would confirm with fstatfs(2) but that is prohibited by seccomp. 203 DiskBackedFile: true, 204 } 205 var err error 206 mf, err = pgalloc.NewMemoryFile(tmpfsOpts.FilestoreFD.ReleaseToFile("overlay-filestore"), mfOpts) 207 if err != nil { 208 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: pgalloc.NewMemoryFile failed: %v", err) 209 return nil, nil, err 210 } 211 privateMF = true 212 } 213 } 214 215 mopts := vfs.GenericParseMountOptions(opts.Data) 216 rootMode := linux.FileMode(0777) 217 if rootFileType == linux.S_IFDIR { 218 rootMode = 01777 219 } 220 modeStr, ok := mopts["mode"] 221 if ok { 222 delete(mopts, "mode") 223 mode, err := strconv.ParseUint(modeStr, 8, 32) 224 if err != nil { 225 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) 226 return nil, nil, linuxerr.EINVAL 227 } 228 rootMode = linux.FileMode(mode & 07777) 229 } 230 rootKUID := creds.EffectiveKUID 231 uidStr, ok := mopts["uid"] 232 if ok { 233 delete(mopts, "uid") 234 uid, err := strconv.ParseUint(uidStr, 10, 32) 235 if err != nil { 236 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) 237 return nil, nil, linuxerr.EINVAL 238 } 239 kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) 240 if !kuid.Ok() { 241 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) 242 return nil, nil, linuxerr.EINVAL 243 } 244 rootKUID = kuid 245 } 246 rootKGID := creds.EffectiveKGID 247 gidStr, ok := mopts["gid"] 248 if ok { 249 delete(mopts, "gid") 250 gid, err := strconv.ParseUint(gidStr, 10, 32) 251 if err != nil { 252 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) 253 return nil, nil, linuxerr.EINVAL 254 } 255 kgid := creds.UserNamespace.MapToKGID(auth.GID(gid)) 256 if !kgid.Ok() { 257 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) 258 return nil, nil, linuxerr.EINVAL 259 } 260 rootKGID = kgid 261 } 262 maxSizeInPages := getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize 263 maxSizeStr, ok := mopts["size"] 264 if ok { 265 delete(mopts, "size") 266 maxSizeInBytes, err := parseSize(maxSizeStr) 267 if err != nil { 268 ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err) 269 return nil, nil, linuxerr.EINVAL 270 } 271 // Convert size in bytes to nearest Page Size bytes 272 // as Linux allocates memory in terms of Page size. 273 maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes) 274 if !ok { 275 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok) 276 return nil, nil, linuxerr.EINVAL 277 } 278 } 279 280 if len(mopts) != 0 { 281 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 282 return nil, nil, linuxerr.EINVAL 283 } 284 285 devMinor, err := vfsObj.GetAnonBlockDevMinor() 286 if err != nil { 287 return nil, nil, err 288 } 289 clock := time.RealtimeClockFromContext(ctx) 290 memUsage := usage.Tmpfs 291 if tmpfsOpts.Usage != nil { 292 memUsage = *tmpfsOpts.Usage 293 } 294 fs := filesystem{ 295 mf: mf, 296 privateMF: privateMF, 297 mfp: mfp, 298 clock: clock, 299 devMinor: devMinor, 300 mopts: opts.Data, 301 usage: memUsage, 302 maxFilenameLen: linux.NAME_MAX, 303 maxSizeInPages: maxSizeInPages, 304 } 305 fs.vfsfs.Init(vfsObj, newFSType, &fs) 306 if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 { 307 fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen 308 } 309 310 var root *dentry 311 switch rootFileType { 312 case linux.S_IFREG: 313 root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */)) 314 case linux.S_IFLNK: 315 root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */)) 316 case linux.S_IFDIR: 317 root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry 318 default: 319 fs.vfsfs.DecRef(ctx) 320 return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) 321 } 322 fs.root = root 323 return &fs.vfsfs, &root.vfsd, nil 324 } 325 326 // Release implements vfs.FilesystemImpl.Release. 327 func (fs *filesystem) Release(ctx context.Context) { 328 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 329 fs.mu.Lock() 330 if fs.root.inode.isDir() { 331 fs.root.releaseChildrenLocked(ctx) 332 } 333 fs.mu.Unlock() 334 if fs.privateMF { 335 fs.mf.Destroy() 336 } 337 } 338 339 // releaseChildrenLocked is called on the mount point by filesystem.Release() to 340 // destroy all objects in the mount. It performs a depth-first walk of the 341 // filesystem and "unlinks" everything by decrementing link counts 342 // appropriately. There should be no open file descriptors when this is called, 343 // so each inode should only have one outstanding reference that is removed once 344 // its link count hits zero. 345 // 346 // Note that we do not update filesystem state precisely while tearing down (for 347 // instance, the child maps are ignored)--we only care to remove all remaining 348 // references so that every filesystem object gets destroyed. Also note that we 349 // do not need to trigger DecRef on the mount point itself or any child mount; 350 // these are taken care of by the destructor of the enclosing MountNamespace. 351 // 352 // Precondition: filesystem.mu is held. 353 func (d *dentry) releaseChildrenLocked(ctx context.Context) { 354 dir := d.inode.impl.(*directory) 355 for _, child := range dir.childMap { 356 if child.inode.isDir() { 357 child.releaseChildrenLocked(ctx) 358 child.inode.decLinksLocked(ctx) // link for child/. 359 dir.inode.decLinksLocked(ctx) // link for child/.. 360 } 361 child.inode.decLinksLocked(ctx) // link for child 362 } 363 } 364 365 func (fs *filesystem) statFS() linux.Statfs { 366 st := linux.Statfs{ 367 Type: linux.TMPFS_MAGIC, 368 BlockSize: hostarch.PageSize, 369 FragmentSize: hostarch.PageSize, 370 NameLength: linux.NAME_MAX, 371 } 372 373 // If size is set for tmpfs return set values. 374 st.Blocks = fs.maxSizeInPages 375 pagesUsed := fs.pagesUsed.Load() 376 st.BlocksFree = fs.maxSizeInPages - pagesUsed 377 st.BlocksAvailable = fs.maxSizeInPages - pagesUsed 378 return st 379 } 380 381 // dentry implements vfs.DentryImpl. 382 // 383 // +stateify savable 384 type dentry struct { 385 vfsd vfs.Dentry 386 387 // parent is this dentry's parent directory. Each referenced dentry holds a 388 // reference on parent.dentry. If this dentry is a filesystem root, parent 389 // is nil. parent is protected by filesystem.mu. 390 parent *dentry 391 392 // name is the name of this dentry in its parent. If this dentry is a 393 // filesystem root, name is the empty string. name is protected by 394 // filesystem.mu. 395 name string 396 397 // dentryEntry (ugh) links dentries into their parent directory.childList. 398 dentryEntry 399 400 // inode is the inode represented by this dentry. Multiple Dentries may 401 // share a single non-directory inode (with hard links). inode is 402 // immutable. 403 // 404 // tmpfs doesn't count references on dentries; because the dentry tree is 405 // the sole source of truth, it is by definition always consistent with the 406 // state of the filesystem. However, it does count references on inodes, 407 // because inode resources are released when all references are dropped. 408 // dentry therefore forwards reference counting directly to inode. 409 inode *inode 410 } 411 412 func (fs *filesystem) newDentry(inode *inode) *dentry { 413 d := &dentry{ 414 inode: inode, 415 } 416 d.vfsd.Init(d) 417 return d 418 } 419 420 // IncRef implements vfs.DentryImpl.IncRef. 421 func (d *dentry) IncRef() { 422 d.inode.incRef() 423 } 424 425 // TryIncRef implements vfs.DentryImpl.TryIncRef. 426 func (d *dentry) TryIncRef() bool { 427 return d.inode.tryIncRef() 428 } 429 430 // DecRef implements vfs.DentryImpl.DecRef. 431 func (d *dentry) DecRef(ctx context.Context) { 432 d.inode.decRef(ctx) 433 } 434 435 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 436 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 437 if d.inode.isDir() { 438 events |= linux.IN_ISDIR 439 } 440 441 // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 442 // that d was deleted. 443 deleted := d.vfsd.IsDead() 444 445 d.inode.fs.mu.RLock() 446 // The ordering below is important, Linux always notifies the parent first. 447 if d.parent != nil { 448 d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) 449 } 450 d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) 451 d.inode.fs.mu.RUnlock() 452 } 453 454 // Watches implements vfs.DentryImpl.Watches. 455 func (d *dentry) Watches() *vfs.Watches { 456 return &d.inode.watches 457 } 458 459 // OnZeroWatches implements vfs.Dentry.OnZeroWatches. 460 func (d *dentry) OnZeroWatches(context.Context) {} 461 462 // inode represents a filesystem object. 463 // 464 // +stateify savable 465 type inode struct { 466 // fs is the owning filesystem. fs is immutable. 467 fs *filesystem 468 469 // A reference is held on all inodes as long as they are reachable in the 470 // filesystem tree, i.e. nlink is nonzero. This reference is dropped when 471 // nlink reaches 0. 472 refs inodeRefs 473 474 // xattrs implements extended attributes. 475 // 476 // TODO(b/148380782): Support xattrs other than user.* 477 xattrs memxattr.SimpleExtendedAttributes 478 479 // Inode metadata. Writing multiple fields atomically requires holding 480 // mu, othewise atomic operations can be used. 481 mu inodeMutex `state:"nosave"` 482 mode atomicbitops.Uint32 // file type and mode 483 nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu 484 uid atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 485 gid atomicbitops.Uint32 // auth.KGID, but ... 486 ino uint64 // immutable 487 488 // Linux's tmpfs has no concept of btime. 489 atime atomicbitops.Int64 // nanoseconds 490 ctime atomicbitops.Int64 // nanoseconds 491 mtime atomicbitops.Int64 // nanoseconds 492 493 locks vfs.FileLocks 494 495 // Inotify watches for this inode. 496 watches vfs.Watches 497 498 impl any // immutable 499 } 500 501 const maxLinks = math.MaxUint32 502 503 func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) { 504 if mode.FileType() == 0 { 505 panic("file type is required in FileMode") 506 } 507 508 // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). 509 if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID { 510 kgid = auth.KGID(parentDir.inode.gid.Load()) 511 if mode&linux.S_IFDIR == linux.S_IFDIR { 512 mode |= linux.S_ISGID 513 } 514 } 515 516 i.fs = fs 517 i.mode = atomicbitops.FromUint32(uint32(mode)) 518 i.uid = atomicbitops.FromUint32(uint32(kuid)) 519 i.gid = atomicbitops.FromUint32(uint32(kgid)) 520 i.ino = fs.nextInoMinusOne.Add(1) 521 // Tmpfs creation sets atime, ctime, and mtime to current time. 522 now := fs.clock.Now().Nanoseconds() 523 i.atime = atomicbitops.FromInt64(now) 524 i.ctime = atomicbitops.FromInt64(now) 525 i.mtime = atomicbitops.FromInt64(now) 526 // i.nlink initialized by caller 527 i.impl = impl 528 i.refs.InitRefs() 529 } 530 531 // incLinksLocked increments i's link count. 532 // 533 // Preconditions: 534 // - filesystem.mu must be locked for writing. 535 // - i.mu must be lcoked. 536 // - i.nlink != 0. 537 // - i.nlink < maxLinks. 538 func (i *inode) incLinksLocked() { 539 if i.nlink.RacyLoad() == 0 { 540 panic("tmpfs.inode.incLinksLocked() called with no existing links") 541 } 542 if i.nlink.RacyLoad() == maxLinks { 543 panic("tmpfs.inode.incLinksLocked() called with maximum link count") 544 } 545 i.nlink.Add(1) 546 } 547 548 // decLinksLocked decrements i's link count. If the link count reaches 0, we 549 // remove a reference on i as well. 550 // 551 // Preconditions: 552 // - filesystem.mu must be locked for writing. 553 // - i.mu must be lcoked. 554 // - i.nlink != 0. 555 func (i *inode) decLinksLocked(ctx context.Context) { 556 if i.nlink.RacyLoad() == 0 { 557 panic("tmpfs.inode.decLinksLocked() called with no existing links") 558 } 559 if i.nlink.Add(^uint32(0)) == 0 { 560 i.decRef(ctx) 561 } 562 } 563 564 func (i *inode) incRef() { 565 i.refs.IncRef() 566 } 567 568 func (i *inode) tryIncRef() bool { 569 return i.refs.TryIncRef() 570 } 571 572 func (i *inode) decRef(ctx context.Context) { 573 i.refs.DecRef(func() { 574 i.watches.HandleDeletion(ctx) 575 // Remove pages used if child being removed is a SymLink or Regular File. 576 switch impl := i.impl.(type) { 577 case *symlink: 578 if len(impl.target) >= shortSymlinkLen { 579 impl.inode.fs.unaccountPages(1) 580 } 581 case *regularFile: 582 // Release memory used by regFile to store data. Since regFile is 583 // no longer usable, we don't need to grab any locks or update any 584 // metadata. 585 pagesDec := impl.data.DropAll(i.fs.mf) 586 impl.inode.fs.unaccountPages(pagesDec) 587 } 588 589 }) 590 } 591 592 func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 593 mode := linux.FileMode(i.mode.Load()) 594 return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())) 595 } 596 597 // Go won't inline this function, and returning linux.Statx (which is quite 598 // big) means spending a lot of time in runtime.duffcopy(), so instead it's an 599 // output parameter. 600 // 601 // Note that Linux does not guarantee to return consistent data (in the case of 602 // a concurrent modification), so we do not require holding inode.mu. 603 func (i *inode) statTo(stat *linux.Statx) { 604 stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | 605 linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE | 606 linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME | 607 linux.STATX_MTIME 608 stat.Blksize = hostarch.PageSize 609 stat.Nlink = i.nlink.Load() 610 stat.UID = i.uid.Load() 611 stat.GID = i.gid.Load() 612 stat.Mode = uint16(i.mode.Load()) 613 stat.Ino = i.ino 614 stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load()) 615 stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load()) 616 stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load()) 617 stat.DevMajor = linux.UNNAMED_MAJOR 618 stat.DevMinor = i.fs.devMinor 619 switch impl := i.impl.(type) { 620 case *regularFile: 621 stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS 622 stat.Size = uint64(impl.size.Load()) 623 // TODO(jamieliu): This should be impl.data.Span() / 512, but this is 624 // too expensive to compute here. Cache it in regularFile. 625 stat.Blocks = allocatedBlocksForSize(stat.Size) 626 case *directory: 627 stat.Size = direntSize * (2 + uint64(impl.numChildren.Load())) 628 // stat.Blocks is 0. 629 case *symlink: 630 stat.Size = uint64(len(impl.target)) 631 // stat.Blocks is 0. 632 case *namedPipe, *socketFile: 633 // stat.Size and stat.Blocks are 0. 634 case *deviceFile: 635 // stat.Size and stat.Blocks are 0. 636 stat.RdevMajor = impl.major 637 stat.RdevMinor = impl.minor 638 default: 639 panic(fmt.Sprintf("unknown inode type: %T", i.impl)) 640 } 641 } 642 643 func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error { 644 stat := &opts.Stat 645 if stat.Mask == 0 { 646 return nil 647 } 648 if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { 649 return linuxerr.EPERM 650 } 651 mode := linux.FileMode(i.mode.Load()) 652 if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil { 653 return err 654 } 655 656 i.mu.Lock() 657 defer i.mu.Unlock() 658 var ( 659 needsMtimeBump bool 660 needsCtimeBump bool 661 ) 662 clearSID := false 663 mask := stat.Mask 664 if mask&linux.STATX_SIZE != 0 { 665 switch impl := i.impl.(type) { 666 case *regularFile: 667 updated, err := impl.truncateLocked(stat.Size) 668 if err != nil { 669 return err 670 } 671 if updated { 672 clearSID = true 673 needsMtimeBump = true 674 needsCtimeBump = true 675 } 676 case *directory: 677 return linuxerr.EISDIR 678 default: 679 return linuxerr.EINVAL 680 } 681 } 682 if mask&linux.STATX_UID != 0 { 683 i.uid.Store(stat.UID) 684 needsCtimeBump = true 685 clearSID = true 686 } 687 if mask&linux.STATX_GID != 0 { 688 i.gid.Store(stat.GID) 689 needsCtimeBump = true 690 clearSID = true 691 } 692 if mask&linux.STATX_MODE != 0 { 693 for { 694 old := i.mode.Load() 695 ft := old & linux.S_IFMT 696 newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT)) 697 if clearSID { 698 newMode = vfs.ClearSUIDAndSGID(newMode) 699 } 700 if swapped := i.mode.CompareAndSwap(old, newMode); swapped { 701 clearSID = false 702 break 703 } 704 } 705 needsCtimeBump = true 706 } 707 now := i.fs.clock.Now().Nanoseconds() 708 if mask&linux.STATX_ATIME != 0 { 709 if stat.Atime.Nsec == linux.UTIME_NOW { 710 i.atime.Store(now) 711 } else { 712 i.atime.Store(stat.Atime.ToNsecCapped()) 713 } 714 needsCtimeBump = true 715 } 716 if mask&linux.STATX_MTIME != 0 { 717 if stat.Mtime.Nsec == linux.UTIME_NOW { 718 i.mtime.Store(now) 719 } else { 720 i.mtime.Store(stat.Mtime.ToNsecCapped()) 721 } 722 needsCtimeBump = true 723 // Ignore the mtime bump, since we just set it ourselves. 724 needsMtimeBump = false 725 } 726 if mask&linux.STATX_CTIME != 0 { 727 if stat.Ctime.Nsec == linux.UTIME_NOW { 728 i.ctime.Store(now) 729 } else { 730 i.ctime.Store(stat.Ctime.ToNsecCapped()) 731 } 732 // Ignore the ctime bump, since we just set it ourselves. 733 needsCtimeBump = false 734 } 735 736 // We may have to clear the SUID/SGID bits, but didn't do so as part of 737 // STATX_MODE. 738 if clearSID { 739 for { 740 old := i.mode.Load() 741 newMode := vfs.ClearSUIDAndSGID(old) 742 if swapped := i.mode.CompareAndSwap(old, newMode); swapped { 743 break 744 } 745 } 746 needsCtimeBump = true 747 } 748 749 if needsMtimeBump { 750 i.mtime.Store(now) 751 } 752 if needsCtimeBump { 753 i.ctime.Store(now) 754 } 755 756 return nil 757 } 758 759 // allocatedBlocksForSize returns the number of 512B blocks needed to 760 // accommodate the given size in bytes, as appropriate for struct 761 // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block 762 // size is independent of the "preferred block size for I/O", struct 763 // stat::st_blksize and struct statx::stx_blksize.) 764 func allocatedBlocksForSize(size uint64) uint64 { 765 return (size + 511) / 512 766 } 767 768 func (i *inode) direntType() uint8 { 769 switch impl := i.impl.(type) { 770 case *regularFile: 771 return linux.DT_REG 772 case *directory: 773 return linux.DT_DIR 774 case *symlink: 775 return linux.DT_LNK 776 case *socketFile: 777 return linux.DT_SOCK 778 case *namedPipe: 779 return linux.DT_FIFO 780 case *deviceFile: 781 switch impl.kind { 782 case vfs.BlockDevice: 783 return linux.DT_BLK 784 case vfs.CharDevice: 785 return linux.DT_CHR 786 default: 787 panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) 788 } 789 default: 790 panic(fmt.Sprintf("unknown inode type: %T", i.impl)) 791 } 792 } 793 794 func (i *inode) isDir() bool { 795 mode := linux.FileMode(i.mode.Load()) 796 return mode.FileType() == linux.S_IFDIR 797 } 798 799 func (i *inode) touchAtime(mnt *vfs.Mount) { 800 if mnt.Flags.NoATime { 801 return 802 } 803 if err := mnt.CheckBeginWrite(); err != nil { 804 return 805 } 806 now := i.fs.clock.Now().Nanoseconds() 807 i.mu.Lock() 808 i.atime.Store(now) 809 i.mu.Unlock() 810 mnt.EndWrite() 811 } 812 813 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 814 func (i *inode) touchCtime() { 815 now := i.fs.clock.Now().Nanoseconds() 816 i.mu.Lock() 817 i.ctime.Store(now) 818 i.mu.Unlock() 819 } 820 821 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 822 func (i *inode) touchCMtime() { 823 now := i.fs.clock.Now().Nanoseconds() 824 i.mu.Lock() 825 i.mtime.Store(now) 826 i.ctime.Store(now) 827 i.mu.Unlock() 828 } 829 830 // Preconditions: 831 // - The caller has called vfs.Mount.CheckBeginWrite(). 832 // - inode.mu must be locked. 833 func (i *inode) touchCMtimeLocked() { 834 now := i.fs.clock.Now().Nanoseconds() 835 i.mtime.Store(now) 836 i.ctime.Store(now) 837 } 838 839 func checkXattrName(name string) error { 840 // Linux's tmpfs supports "security" and "trusted" xattr namespaces, and 841 // (depending on build configuration) POSIX ACL xattr namespaces 842 // ("system.posix_acl_access" and "system.posix_acl_default"). We don't 843 // support POSIX ACLs or the "security" namespace (b/148380782). 844 if strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { 845 return nil 846 } 847 // We support the "user" namespace because we have tests that depend on 848 // this feature. 849 if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { 850 return nil 851 } 852 return linuxerr.EOPNOTSUPP 853 } 854 855 func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { 856 return i.xattrs.ListXattr(creds, size) 857 } 858 859 func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { 860 if err := checkXattrName(opts.Name); err != nil { 861 return "", err 862 } 863 mode := linux.FileMode(i.mode.Load()) 864 kuid := auth.KUID(i.uid.Load()) 865 kgid := auth.KGID(i.gid.Load()) 866 if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { 867 return "", err 868 } 869 return i.xattrs.GetXattr(creds, mode, kuid, opts) 870 } 871 872 func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { 873 if err := checkXattrName(opts.Name); err != nil { 874 return err 875 } 876 mode := linux.FileMode(i.mode.Load()) 877 kuid := auth.KUID(i.uid.Load()) 878 kgid := auth.KGID(i.gid.Load()) 879 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 880 return err 881 } 882 return i.xattrs.SetXattr(creds, mode, kuid, opts) 883 } 884 885 func (i *inode) removeXattr(creds *auth.Credentials, name string) error { 886 if err := checkXattrName(name); err != nil { 887 return err 888 } 889 mode := linux.FileMode(i.mode.Load()) 890 kuid := auth.KUID(i.uid.Load()) 891 kgid := auth.KGID(i.gid.Load()) 892 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 893 return err 894 } 895 return i.xattrs.RemoveXattr(creds, mode, kuid, name) 896 } 897 898 // fileDescription is embedded by tmpfs implementations of 899 // vfs.FileDescriptionImpl. 900 // 901 // +stateify savable 902 type fileDescription struct { 903 vfsfd vfs.FileDescription 904 vfs.FileDescriptionDefaultImpl 905 vfs.LockFD 906 } 907 908 func (fd *fileDescription) filesystem() *filesystem { 909 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 910 } 911 912 func (fd *fileDescription) dentry() *dentry { 913 return fd.vfsfd.Dentry().Impl().(*dentry) 914 } 915 916 func (fd *fileDescription) inode() *inode { 917 return fd.dentry().inode 918 } 919 920 // Stat implements vfs.FileDescriptionImpl.Stat. 921 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 922 var stat linux.Statx 923 fd.inode().statTo(&stat) 924 return stat, nil 925 } 926 927 // SetStat implements vfs.FileDescriptionImpl.SetStat. 928 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 929 return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts) 930 } 931 932 // StatFS implements vfs.FileDescriptionImpl.StatFS. 933 func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { 934 return fd.filesystem().statFS(), nil 935 } 936 937 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 938 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 939 return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) 940 } 941 942 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 943 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 944 return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) 945 } 946 947 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 948 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 949 return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts) 950 } 951 952 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 953 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 954 return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name) 955 } 956 957 // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all 958 // filesystem state is in-memory. 959 func (*fileDescription) Sync(context.Context) error { 960 return nil 961 } 962 963 // parseSize converts size in string to an integer bytes. 964 // Supported suffixes in string are:K, M, G, T, P, E. 965 func parseSize(s string) (uint64, error) { 966 if len(s) == 0 { 967 return 0, fmt.Errorf("size parameter empty") 968 } 969 suffix := s[len(s)-1] 970 count := 1 971 switch suffix { 972 case 'e', 'E': 973 count = count << 10 974 fallthrough 975 case 'p', 'P': 976 count = count << 10 977 fallthrough 978 case 't', 'T': 979 count = count << 10 980 fallthrough 981 case 'g', 'G': 982 count = count << 10 983 fallthrough 984 case 'm', 'M': 985 count = count << 10 986 fallthrough 987 case 'k', 'K': 988 count = count << 10 989 s = s[:len(s)-1] 990 } 991 byteTmp, err := strconv.ParseUint(s, 10, 64) 992 if err != nil { 993 return 0, linuxerr.EINVAL 994 } 995 // Check for overflow. 996 bytes := byteTmp * uint64(count) 997 if byteTmp != 0 && bytes/byteTmp != uint64(count) { 998 return 0, fmt.Errorf("size overflow") 999 } 1000 return bytes, err 1001 }