// Origin: github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/tmpfs/tmpfs.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package tmpfs provides an in-memory filesystem whose contents are
// application-mutable, consistent with Linux's tmpfs.
//
// Lock order:
//
// filesystem.mu
// inode.mu
// regularFileFD.offMu
// *** "memmap.Mappable locks" below this point
// regularFile.mapsMu
// *** "memmap.Mappable locks taken by Translate" below this point
// regularFile.dataMu
// directory.iterMu
//
// NOTE(review): the list above is shown flat because its original
// indentation (which lock nests under which) was not recoverable; confirm the
// nesting against the upstream file before relying on it.
package tmpfs

import (
	"fmt"
	"math"
	"strconv"
	"strings"
	"sync/atomic"

	"github.com/SagerNet/gvisor/pkg/abi/linux"
	"github.com/SagerNet/gvisor/pkg/context"
	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
	"github.com/SagerNet/gvisor/pkg/hostarch"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
	"github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
	"github.com/SagerNet/gvisor/pkg/sentry/vfs/memxattr"
	"github.com/SagerNet/gvisor/pkg/sync"
	"github.com/SagerNet/gvisor/pkg/syserror"
)

// Name is the default filesystem name.
const Name = "tmpfs"

// FilesystemType implements vfs.FilesystemType.
//
// +stateify savable
type FilesystemType struct{}

// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem backed by this implementation. It is
	// initialized by GetFilesystem, which passes this filesystem as its impl.
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that stores regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// mu serializes changes to the Dentry tree.
	mu sync.RWMutex `state:"nosave"`

	// nextInoMinusOne is the source of inode numbers: inode.init atomically
	// increments it and uses the incremented value as the new inode's number.
	nextInoMinusOne uint64 // accessed using atomic memory operations

	// root is the dentry of the filesystem root. It is set by GetFilesystem.
	root *dentry
}

// Name implements vfs.FilesystemType.Name.
func (FilesystemType) Name() string {
	return Name
}

// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}

// FilesystemOpts is used to pass configuration data to tmpfs.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs.
	FilesystemType vfs.FilesystemType
}

// GetFilesystem implements vfs.FilesystemType.GetFilesystem.
113 func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 114 mfp := pgalloc.MemoryFileProviderFromContext(ctx) 115 if mfp == nil { 116 panic("MemoryFileProviderFromContext returned nil") 117 } 118 119 rootFileType := uint16(linux.S_IFDIR) 120 newFSType := vfs.FilesystemType(&fstype) 121 tmpfsOpts, ok := opts.InternalData.(FilesystemOpts) 122 if ok { 123 if tmpfsOpts.RootFileType != 0 { 124 rootFileType = tmpfsOpts.RootFileType 125 } 126 if tmpfsOpts.FilesystemType != nil { 127 newFSType = tmpfsOpts.FilesystemType 128 } 129 } 130 131 mopts := vfs.GenericParseMountOptions(opts.Data) 132 rootMode := linux.FileMode(0777) 133 if rootFileType == linux.S_IFDIR { 134 rootMode = 01777 135 } 136 modeStr, ok := mopts["mode"] 137 if ok { 138 delete(mopts, "mode") 139 mode, err := strconv.ParseUint(modeStr, 8, 32) 140 if err != nil { 141 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr) 142 return nil, nil, linuxerr.EINVAL 143 } 144 rootMode = linux.FileMode(mode & 07777) 145 } 146 rootKUID := creds.EffectiveKUID 147 uidStr, ok := mopts["uid"] 148 if ok { 149 delete(mopts, "uid") 150 uid, err := strconv.ParseUint(uidStr, 10, 32) 151 if err != nil { 152 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr) 153 return nil, nil, linuxerr.EINVAL 154 } 155 kuid := creds.UserNamespace.MapToKUID(auth.UID(uid)) 156 if !kuid.Ok() { 157 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid) 158 return nil, nil, linuxerr.EINVAL 159 } 160 rootKUID = kuid 161 } 162 rootKGID := creds.EffectiveKGID 163 gidStr, ok := mopts["gid"] 164 if ok { 165 delete(mopts, "gid") 166 gid, err := strconv.ParseUint(gidStr, 10, 32) 167 if err != nil { 168 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr) 169 return nil, nil, linuxerr.EINVAL 170 } 171 kgid := 
creds.UserNamespace.MapToKGID(auth.GID(gid)) 172 if !kgid.Ok() { 173 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid) 174 return nil, nil, linuxerr.EINVAL 175 } 176 rootKGID = kgid 177 } 178 if len(mopts) != 0 { 179 ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts) 180 return nil, nil, linuxerr.EINVAL 181 } 182 183 devMinor, err := vfsObj.GetAnonBlockDevMinor() 184 if err != nil { 185 return nil, nil, err 186 } 187 clock := time.RealtimeClockFromContext(ctx) 188 fs := filesystem{ 189 mfp: mfp, 190 clock: clock, 191 devMinor: devMinor, 192 mopts: opts.Data, 193 } 194 fs.vfsfs.Init(vfsObj, newFSType, &fs) 195 196 var root *dentry 197 switch rootFileType { 198 case linux.S_IFREG: 199 root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */)) 200 case linux.S_IFLNK: 201 root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */)) 202 case linux.S_IFDIR: 203 root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry 204 default: 205 fs.vfsfs.DecRef(ctx) 206 return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType) 207 } 208 fs.root = root 209 return &fs.vfsfs, &root.vfsd, nil 210 } 211 212 // NewFilesystem returns a new tmpfs filesystem. 213 func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) { 214 return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{}) 215 } 216 217 // Release implements vfs.FilesystemImpl.Release. 
218 func (fs *filesystem) Release(ctx context.Context) { 219 fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 220 fs.mu.Lock() 221 if fs.root.inode.isDir() { 222 fs.root.releaseChildrenLocked(ctx) 223 } 224 fs.mu.Unlock() 225 } 226 227 // releaseChildrenLocked is called on the mount point by filesystem.Release() to 228 // destroy all objects in the mount. It performs a depth-first walk of the 229 // filesystem and "unlinks" everything by decrementing link counts 230 // appropriately. There should be no open file descriptors when this is called, 231 // so each inode should only have one outstanding reference that is removed once 232 // its link count hits zero. 233 // 234 // Note that we do not update filesystem state precisely while tearing down (for 235 // instance, the child maps are ignored)--we only care to remove all remaining 236 // references so that every filesystem object gets destroyed. Also note that we 237 // do not need to trigger DecRef on the mount point itself or any child mount; 238 // these are taken care of by the destructor of the enclosing MountNamespace. 239 // 240 // Precondition: filesystem.mu is held. 241 func (d *dentry) releaseChildrenLocked(ctx context.Context) { 242 dir := d.inode.impl.(*directory) 243 for _, child := range dir.childMap { 244 if child.inode.isDir() { 245 child.releaseChildrenLocked(ctx) 246 child.inode.decLinksLocked(ctx) // link for child/. 247 dir.inode.decLinksLocked(ctx) // link for child/.. 248 } 249 child.inode.decLinksLocked(ctx) // link for child 250 } 251 } 252 253 // immutable 254 var globalStatfs = linux.Statfs{ 255 Type: linux.TMPFS_MAGIC, 256 BlockSize: hostarch.PageSize, 257 FragmentSize: hostarch.PageSize, 258 NameLength: linux.NAME_MAX, 259 260 // tmpfs currently does not support configurable size limits. In Linux, 261 // such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from 262 // statfs(2). However, many applications treat this as having a size limit 263 // of 0. 
To work around this, claim to have a very large but non-zero size, 264 // chosen to ensure that BlockSize * Blocks does not overflow int64 (which 265 // applications may also handle incorrectly). 266 // TODO(b/29637826): allow configuring a tmpfs size and enforce it. 267 Blocks: math.MaxInt64 / hostarch.PageSize, 268 BlocksFree: math.MaxInt64 / hostarch.PageSize, 269 BlocksAvailable: math.MaxInt64 / hostarch.PageSize, 270 } 271 272 // dentry implements vfs.DentryImpl. 273 // 274 // +stateify savable 275 type dentry struct { 276 vfsd vfs.Dentry 277 278 // parent is this dentry's parent directory. Each referenced dentry holds a 279 // reference on parent.dentry. If this dentry is a filesystem root, parent 280 // is nil. parent is protected by filesystem.mu. 281 parent *dentry 282 283 // name is the name of this dentry in its parent. If this dentry is a 284 // filesystem root, name is the empty string. name is protected by 285 // filesystem.mu. 286 name string 287 288 // dentryEntry (ugh) links dentries into their parent directory.childList. 289 dentryEntry 290 291 // inode is the inode represented by this dentry. Multiple Dentries may 292 // share a single non-directory inode (with hard links). inode is 293 // immutable. 294 // 295 // tmpfs doesn't count references on dentries; because the dentry tree is 296 // the sole source of truth, it is by definition always consistent with the 297 // state of the filesystem. However, it does count references on inodes, 298 // because inode resources are released when all references are dropped. 299 // dentry therefore forwards reference counting directly to inode. 300 inode *inode 301 } 302 303 func (fs *filesystem) newDentry(inode *inode) *dentry { 304 d := &dentry{ 305 inode: inode, 306 } 307 d.vfsd.Init(d) 308 return d 309 } 310 311 // IncRef implements vfs.DentryImpl.IncRef. 312 func (d *dentry) IncRef() { 313 d.inode.incRef() 314 } 315 316 // TryIncRef implements vfs.DentryImpl.TryIncRef. 
317 func (d *dentry) TryIncRef() bool { 318 return d.inode.tryIncRef() 319 } 320 321 // DecRef implements vfs.DentryImpl.DecRef. 322 func (d *dentry) DecRef(ctx context.Context) { 323 d.inode.decRef(ctx) 324 } 325 326 // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent. 327 func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) { 328 if d.inode.isDir() { 329 events |= linux.IN_ISDIR 330 } 331 332 // tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates 333 // that d was deleted. 334 deleted := d.vfsd.IsDead() 335 336 d.inode.fs.mu.RLock() 337 // The ordering below is important, Linux always notifies the parent first. 338 if d.parent != nil { 339 d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted) 340 } 341 d.inode.watches.Notify(ctx, "", events, cookie, et, deleted) 342 d.inode.fs.mu.RUnlock() 343 } 344 345 // Watches implements vfs.DentryImpl.Watches. 346 func (d *dentry) Watches() *vfs.Watches { 347 return &d.inode.watches 348 } 349 350 // OnZeroWatches implements vfs.Dentry.OnZeroWatches. 351 func (d *dentry) OnZeroWatches(context.Context) {} 352 353 // inode represents a filesystem object. 354 // 355 // +stateify savable 356 type inode struct { 357 // fs is the owning filesystem. fs is immutable. 358 fs *filesystem 359 360 // A reference is held on all inodes as long as they are reachable in the 361 // filesystem tree, i.e. nlink is nonzero. This reference is dropped when 362 // nlink reaches 0. 363 refs inodeRefs 364 365 // xattrs implements extended attributes. 366 // 367 // TODO(b/148380782): Support xattrs other than user.* 368 xattrs memxattr.SimpleExtendedAttributes 369 370 // Inode metadata. Writing multiple fields atomically requires holding 371 // mu, othewise atomic operations can be used. 
372 mu sync.Mutex `state:"nosave"` 373 mode uint32 // file type and mode 374 nlink uint32 // protected by filesystem.mu instead of inode.mu 375 uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic 376 gid uint32 // auth.KGID, but ... 377 ino uint64 // immutable 378 379 // Linux's tmpfs has no concept of btime. 380 atime int64 // nanoseconds 381 ctime int64 // nanoseconds 382 mtime int64 // nanoseconds 383 384 locks vfs.FileLocks 385 386 // Inotify watches for this inode. 387 watches vfs.Watches 388 389 impl interface{} // immutable 390 } 391 392 const maxLinks = math.MaxUint32 393 394 func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) { 395 if mode.FileType() == 0 { 396 panic("file type is required in FileMode") 397 } 398 399 // Inherit the group and setgid bit as in fs/inode.c:inode_init_owner(). 400 if parentDir != nil && parentDir.inode.mode&linux.S_ISGID == linux.S_ISGID { 401 kgid = auth.KGID(parentDir.inode.gid) 402 if mode&linux.S_IFDIR == linux.S_IFDIR { 403 mode |= linux.S_ISGID 404 } 405 } 406 407 i.fs = fs 408 i.mode = uint32(mode) 409 i.uid = uint32(kuid) 410 i.gid = uint32(kgid) 411 i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) 412 // Tmpfs creation sets atime, ctime, and mtime to current time. 413 now := fs.clock.Now().Nanoseconds() 414 i.atime = now 415 i.ctime = now 416 i.mtime = now 417 // i.nlink initialized by caller 418 i.impl = impl 419 i.refs.InitRefs() 420 } 421 422 // incLinksLocked increments i's link count. 423 // 424 // Preconditions: 425 // * filesystem.mu must be locked for writing. 426 // * i.nlink != 0. 427 // * i.nlink < maxLinks. 
428 func (i *inode) incLinksLocked() { 429 if i.nlink == 0 { 430 panic("tmpfs.inode.incLinksLocked() called with no existing links") 431 } 432 if i.nlink == maxLinks { 433 panic("tmpfs.inode.incLinksLocked() called with maximum link count") 434 } 435 atomic.AddUint32(&i.nlink, 1) 436 } 437 438 // decLinksLocked decrements i's link count. If the link count reaches 0, we 439 // remove a reference on i as well. 440 // 441 // Preconditions: 442 // * filesystem.mu must be locked for writing. 443 // * i.nlink != 0. 444 func (i *inode) decLinksLocked(ctx context.Context) { 445 if i.nlink == 0 { 446 panic("tmpfs.inode.decLinksLocked() called with no existing links") 447 } 448 if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 { 449 i.decRef(ctx) 450 } 451 } 452 453 func (i *inode) incRef() { 454 i.refs.IncRef() 455 } 456 457 func (i *inode) tryIncRef() bool { 458 return i.refs.TryIncRef() 459 } 460 461 func (i *inode) decRef(ctx context.Context) { 462 i.refs.DecRef(func() { 463 i.watches.HandleDeletion(ctx) 464 if regFile, ok := i.impl.(*regularFile); ok { 465 // Release memory used by regFile to store data. Since regFile is 466 // no longer usable, we don't need to grab any locks or update any 467 // metadata. 468 regFile.data.DropAll(regFile.memFile) 469 } 470 }) 471 } 472 473 func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error { 474 mode := linux.FileMode(atomic.LoadUint32(&i.mode)) 475 return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) 476 } 477 478 // Go won't inline this function, and returning linux.Statx (which is quite 479 // big) means spending a lot of time in runtime.duffcopy(), so instead it's an 480 // output parameter. 481 // 482 // Note that Linux does not guarantee to return consistent data (in the case of 483 // a concurrent modification), so we do not require holding inode.mu. 
func (i *inode) statTo(stat *linux.Statx) {
	// Fields reported for every inode type.
	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
		linux.STATX_MTIME
	stat.Blksize = hostarch.PageSize
	stat.Nlink = atomic.LoadUint32(&i.nlink)
	stat.UID = atomic.LoadUint32(&i.uid)
	stat.GID = atomic.LoadUint32(&i.gid)
	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
	stat.Ino = i.ino
	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.atime))
	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.ctime))
	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.mtime))
	stat.DevMajor = linux.UNNAMED_MAJOR
	stat.DevMinor = i.fs.devMinor
	// Size, block count and device numbers depend on the kind of inode. An
	// implementation type not listed below is a programming error.
	switch impl := i.impl.(type) {
	case *regularFile:
		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
		stat.Size = uint64(atomic.LoadUint64(&impl.size))
		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
		// too expensive to compute here. Cache it in regularFile.
		stat.Blocks = allocatedBlocksForSize(stat.Size)
	case *directory:
		// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
		stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren)))
		// stat.Blocks is 0.
	case *symlink:
		stat.Size = uint64(len(impl.target))
		// stat.Blocks is 0.
	case *namedPipe, *socketFile:
		// stat.Size and stat.Blocks are 0.
	case *deviceFile:
		// stat.Size and stat.Blocks are 0.
		stat.RdevMajor = impl.major
		stat.RdevMinor = impl.minor
	default:
		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
	}
}

// setStat applies the fields selected by opts.Stat.Mask to i.
//
// Only mode, uid, gid, atime, mtime, ctime and size may be set; any other
// mask bit yields EPERM. The request is first checked with vfs.CheckSetStat
// against i's current mode and ownership. Setting the size is only supported
// on regular files (EISDIR for directories, EINVAL for everything else).
//
// Changing the owner or group, or a truncation that actually changed the
// file, clears the SUID/SGID bits. ctime (and, for truncation, mtime) is
// bumped to the current time unless the same call sets that timestamp
// explicitly.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that the SUID/SGID bits must be cleared. It is reset
	// once the STATX_MODE branch below has done so as part of its own update.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return syserror.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		atomic.StoreUint32(&i.uid, stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		atomic.StoreUint32(&i.gid, stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Compare-and-swap loop: keep the existing file type bits, replace
		// the permission bits, and retry if i.mode changed underneath us.
		for {
			old := atomic.LoadUint32(&i.mode)
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := atomic.CompareAndSwapUint32(&i.mode, old, newMode); swapped {
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	// A single timestamp is used for every UTIME_NOW and implicit bump below.
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			atomic.StoreInt64(&i.atime, now)
		} else {
			atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			atomic.StoreInt64(&i.mtime, now)
		} else {
			atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			atomic.StoreInt64(&i.ctime, now)
		} else {
			atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := atomic.LoadUint32(&i.mode)
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := atomic.CompareAndSwapUint32(&i.mode, old, newMode); swapped {
				break
			}
		}
		needsCtimeBump = true
	}

	if needsMtimeBump {
		atomic.StoreInt64(&i.mtime, now)
	}
	if needsCtimeBump {
		atomic.StoreInt64(&i.ctime, now)
	}

	return nil
}

// allocatedBlocksForSize returns the number of 512B blocks needed to
// accommodate the given size in bytes, as appropriate for struct
// stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
// size is independent of the "preferred block size for I/O", struct
// stat::st_blksize and struct statx::stx_blksize.)
646 func allocatedBlocksForSize(size uint64) uint64 { 647 return (size + 511) / 512 648 } 649 650 func (i *inode) direntType() uint8 { 651 switch impl := i.impl.(type) { 652 case *regularFile: 653 return linux.DT_REG 654 case *directory: 655 return linux.DT_DIR 656 case *symlink: 657 return linux.DT_LNK 658 case *socketFile: 659 return linux.DT_SOCK 660 case *namedPipe: 661 return linux.DT_FIFO 662 case *deviceFile: 663 switch impl.kind { 664 case vfs.BlockDevice: 665 return linux.DT_BLK 666 case vfs.CharDevice: 667 return linux.DT_CHR 668 default: 669 panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) 670 } 671 default: 672 panic(fmt.Sprintf("unknown inode type: %T", i.impl)) 673 } 674 } 675 676 func (i *inode) isDir() bool { 677 mode := linux.FileMode(atomic.LoadUint32(&i.mode)) 678 return mode.FileType() == linux.S_IFDIR 679 } 680 681 func (i *inode) touchAtime(mnt *vfs.Mount) { 682 if mnt.Flags.NoATime { 683 return 684 } 685 if err := mnt.CheckBeginWrite(); err != nil { 686 return 687 } 688 now := i.fs.clock.Now().Nanoseconds() 689 i.mu.Lock() 690 atomic.StoreInt64(&i.atime, now) 691 i.mu.Unlock() 692 mnt.EndWrite() 693 } 694 695 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 696 func (i *inode) touchCtime() { 697 now := i.fs.clock.Now().Nanoseconds() 698 i.mu.Lock() 699 atomic.StoreInt64(&i.ctime, now) 700 i.mu.Unlock() 701 } 702 703 // Preconditions: The caller has called vfs.Mount.CheckBeginWrite(). 704 func (i *inode) touchCMtime() { 705 now := i.fs.clock.Now().Nanoseconds() 706 i.mu.Lock() 707 atomic.StoreInt64(&i.mtime, now) 708 atomic.StoreInt64(&i.ctime, now) 709 i.mu.Unlock() 710 } 711 712 // Preconditions: 713 // * The caller has called vfs.Mount.CheckBeginWrite(). 714 // * inode.mu must be locked. 
715 func (i *inode) touchCMtimeLocked() { 716 now := i.fs.clock.Now().Nanoseconds() 717 atomic.StoreInt64(&i.mtime, now) 718 atomic.StoreInt64(&i.ctime, now) 719 } 720 721 func checkXattrName(name string) error { 722 // Linux's tmpfs supports "security" and "trusted" xattr namespaces, and 723 // (depending on build configuration) POSIX ACL xattr namespaces 724 // ("system.posix_acl_access" and "system.posix_acl_default"). We don't 725 // support POSIX ACLs or the "security" namespace (b/148380782). 726 if strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) { 727 return nil 728 } 729 // We support the "user" namespace because we have tests that depend on 730 // this feature. 731 if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) { 732 return nil 733 } 734 return syserror.EOPNOTSUPP 735 } 736 737 func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) { 738 return i.xattrs.ListXattr(creds, size) 739 } 740 741 func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) { 742 if err := checkXattrName(opts.Name); err != nil { 743 return "", err 744 } 745 mode := linux.FileMode(atomic.LoadUint32(&i.mode)) 746 kuid := auth.KUID(atomic.LoadUint32(&i.uid)) 747 kgid := auth.KGID(atomic.LoadUint32(&i.gid)) 748 if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil { 749 return "", err 750 } 751 return i.xattrs.GetXattr(creds, mode, kuid, opts) 752 } 753 754 func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error { 755 if err := checkXattrName(opts.Name); err != nil { 756 return err 757 } 758 mode := linux.FileMode(atomic.LoadUint32(&i.mode)) 759 kuid := auth.KUID(atomic.LoadUint32(&i.uid)) 760 kgid := auth.KGID(atomic.LoadUint32(&i.gid)) 761 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 762 return err 763 } 764 return i.xattrs.SetXattr(creds, mode, kuid, opts) 765 } 766 767 func (i *inode) removeXattr(creds 
*auth.Credentials, name string) error { 768 if err := checkXattrName(name); err != nil { 769 return err 770 } 771 mode := linux.FileMode(atomic.LoadUint32(&i.mode)) 772 kuid := auth.KUID(atomic.LoadUint32(&i.uid)) 773 kgid := auth.KGID(atomic.LoadUint32(&i.gid)) 774 if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil { 775 return err 776 } 777 return i.xattrs.RemoveXattr(creds, mode, kuid, name) 778 } 779 780 // fileDescription is embedded by tmpfs implementations of 781 // vfs.FileDescriptionImpl. 782 // 783 // +stateify savable 784 type fileDescription struct { 785 vfsfd vfs.FileDescription 786 vfs.FileDescriptionDefaultImpl 787 vfs.LockFD 788 } 789 790 func (fd *fileDescription) filesystem() *filesystem { 791 return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) 792 } 793 794 func (fd *fileDescription) dentry() *dentry { 795 return fd.vfsfd.Dentry().Impl().(*dentry) 796 } 797 798 func (fd *fileDescription) inode() *inode { 799 return fd.dentry().inode 800 } 801 802 // Stat implements vfs.FileDescriptionImpl.Stat. 803 func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { 804 var stat linux.Statx 805 fd.inode().statTo(&stat) 806 return stat, nil 807 } 808 809 // SetStat implements vfs.FileDescriptionImpl.SetStat. 810 func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { 811 creds := auth.CredentialsFromContext(ctx) 812 d := fd.dentry() 813 if err := d.inode.setStat(ctx, creds, &opts); err != nil { 814 return err 815 } 816 817 if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 { 818 d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent) 819 } 820 return nil 821 } 822 823 // StatFS implements vfs.FileDescriptionImpl.StatFS. 824 func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) { 825 return globalStatfs, nil 826 } 827 828 // ListXattr implements vfs.FileDescriptionImpl.ListXattr. 
829 func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) { 830 return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size) 831 } 832 833 // GetXattr implements vfs.FileDescriptionImpl.GetXattr. 834 func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) { 835 return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts) 836 } 837 838 // SetXattr implements vfs.FileDescriptionImpl.SetXattr. 839 func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error { 840 d := fd.dentry() 841 if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil { 842 return err 843 } 844 845 // Generate inotify events. 846 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 847 return nil 848 } 849 850 // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr. 851 func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error { 852 d := fd.dentry() 853 if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil { 854 return err 855 } 856 857 // Generate inotify events. 858 d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent) 859 return nil 860 } 861 862 // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all 863 // filesystem state is in-memory. 864 func (*fileDescription) Sync(context.Context) error { 865 return nil 866 }