github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/tmpfs/regular_file.go

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tmpfs

import (
	"fmt"
	"io"
	"math"

	"github.com/metacubex/gvisor/pkg/abi/linux"
	"github.com/metacubex/gvisor/pkg/atomicbitops"
	"github.com/metacubex/gvisor/pkg/context"
	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
	"github.com/metacubex/gvisor/pkg/hostarch"
	"github.com/metacubex/gvisor/pkg/safemem"
	"github.com/metacubex/gvisor/pkg/sentry/fsmetric"
	"github.com/metacubex/gvisor/pkg/sentry/fsutil"
	"github.com/metacubex/gvisor/pkg/sentry/hostfd"
	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
	"github.com/metacubex/gvisor/pkg/sentry/memmap"
	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
	"github.com/metacubex/gvisor/pkg/sentry/usage"
	"github.com/metacubex/gvisor/pkg/sentry/vfs"
	"github.com/metacubex/gvisor/pkg/sync"
	"github.com/metacubex/gvisor/pkg/usermem"
)

// regularFile is a regular (=S_IFREG) tmpfs file.
//
// +stateify savable
type regularFile struct {
	inode inode

	// memoryUsageKind is the memory accounting category under which pages
	// backing this regularFile's contents are accounted.
	memoryUsageKind usage.MemoryKind

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the file into memmap.MappingSpaces.
	//
	// Protected by mapsMu.
	mappings memmap.MappingSet

	// writableMappingPages tracks how many pages of virtual memory are mapped
	// as potentially writable from this file. If a page has multiple mappings,
	// each mapping is counted separately.
	//
	// This counter is susceptible to overflow as we can potentially count
	// mappings from many VMAs. We count pages rather than bytes to slightly
	// mitigate this.
	//
	// Protected by mapsMu.
	writableMappingPages uint64

	// dataMu protects the fields below.
	dataMu sync.RWMutex `state:"nosave"`

	// data maps offsets into the file to offsets into memFile that store
	// the file's data.
	//
	// Protected by dataMu.
	data fsutil.FileRangeSet

	// seals represents file seals on this inode.
	//
	// Protected by dataMu.
	seals uint32

	// size is the size of data.
	//
	// Protected by both dataMu and inode.mu; reading it requires holding
	// either mutex, while writing requires holding both AND using atomics.
	// Readers that do not require consistency (like Stat) may read the
	// value atomically without holding either lock.
	size atomicbitops.Uint64
}

func (fs *filesystem) newRegularFile(kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) *inode {
	file := &regularFile{
		memoryUsageKind: fs.usage,
		seals:           linux.F_SEAL_SEAL,
	}
	file.inode.init(file, fs, kuid, kgid, linux.S_IFREG|mode, parentDir)
	file.inode.nlink = atomicbitops.FromUint32(1) // from parent directory
	return &file.inode
}

// newUnlinkedRegularFileDescription creates a regular file on the tmpfs
// filesystem represented by mount and returns an FD representing that file.
// The new file is not reachable by path traversal from any other file.
//
// newUnlinkedRegularFileDescription is analogous to Linux's
// mm/shmem.c:__shmem_file_setup().
//
// Preconditions: mount must be a tmpfs mount.
func newUnlinkedRegularFileDescription(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, name string) (*regularFileFD, error) {
	fs, ok := mount.Filesystem().Impl().(*filesystem)
	if !ok {
		panic("tmpfs.newUnlinkedRegularFileDescription() called with non-tmpfs mount")
	}

	inode := fs.newRegularFile(creds.EffectiveKUID, creds.EffectiveKGID, 0777, nil /* parentDir */)
	d := fs.newDentry(inode)
	defer d.DecRef(ctx)
	d.name = name

	fd := &regularFileFD{}
	fd.Init(&inode.locks)
	flags := uint32(linux.O_RDWR)
	if err := fd.vfsfd.Init(fd, flags, mount, &d.vfsd, &vfs.FileDescriptionOptions{}); err != nil {
		return nil, err
	}
	return fd, nil
}

// NewZeroFile creates a new regular file and file description as for
// mmap(MAP_SHARED | MAP_ANONYMOUS). The file has the given size and is
// initially (implicitly) filled with zeroes.
//
// Preconditions: mount must be a tmpfs mount.
func NewZeroFile(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, size uint64) (*vfs.FileDescription, error) {
	// Compare mm/shmem.c:shmem_zero_setup().
	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, "dev/zero")
	if err != nil {
		return nil, err
	}
	rf := fd.inode().impl.(*regularFile)
	rf.memoryUsageKind = usage.Anonymous
	rf.size.Store(size)
	return &fd.vfsfd, err
}

// NewMemfd creates a new regular file and file description as for
// memfd_create.
//
// Preconditions: mount must be a tmpfs mount.
func NewMemfd(ctx context.Context, creds *auth.Credentials, mount *vfs.Mount, allowSeals bool, name string) (*vfs.FileDescription, error) {
	fd, err := newUnlinkedRegularFileDescription(ctx, creds, mount, name)
	if err != nil {
		return nil, err
	}
	if allowSeals {
		fd.inode().impl.(*regularFile).seals = 0
	}
	return &fd.vfsfd, nil
}

// truncate grows or shrinks the file to the given size. It returns true if
// the file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
	rf.inode.mu.Lock()
	defer rf.inode.mu.Unlock()
	return rf.truncateLocked(newSize)
}

// Preconditions:
//   - rf.inode.mu must be held.
//   - rf.dataMu must be locked for writing.
//   - newSize > rf.size.
func (rf *regularFile) growLocked(newSize uint64) error {
	// Can we grow the file?
	if rf.seals&linux.F_SEAL_GROW != 0 {
		return linuxerr.EPERM
	}
	rf.size.Store(newSize)
	return nil
}

// Preconditions: rf.inode.mu must be held.
func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
	oldSize := rf.size.RacyLoad()
	if newSize == oldSize {
		// Nothing to do.
		return false, nil
	}

	// Need to hold inode.mu and dataMu while modifying size.
	rf.dataMu.Lock()
	if newSize > oldSize {
		err := rf.growLocked(newSize)
		rf.dataMu.Unlock()
		return err == nil, err
	}

	// We are shrinking the file. First check if this is allowed.
	if rf.seals&linux.F_SEAL_SHRINK != 0 {
		rf.dataMu.Unlock()
		return false, linuxerr.EPERM
	}

	rf.size.Store(newSize)
	rf.dataMu.Unlock()

	// Invalidate past translations of truncated pages.
	oldpgend := offsetPageEnd(int64(oldSize))
	newpgend := offsetPageEnd(int64(newSize))
	if newpgend < oldpgend {
		rf.mapsMu.Lock()
		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/shmem.c:shmem_setattr() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		rf.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated
	// pages, and can remove them.
	rf.dataMu.Lock()
	decPages := rf.data.Truncate(newSize, rf.inode.fs.mf)
	rf.dataMu.Unlock()
	rf.inode.fs.unaccountPages(decPages)
	return true, nil
}

// AddMapping implements memmap.Mappable.AddMapping.
func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) error {
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()
	rf.dataMu.RLock()
	defer rf.dataMu.RUnlock()

	// Reject writable mapping if F_SEAL_WRITE is set.
	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
		return linuxerr.EPERM
	}

	rf.mappings.AddMapping(ms, ar, offset, writable)
	if writable {
		pagesBefore := rf.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		rf.writableMappingPages += uint64(ar.Length() / hostarch.PageSize)

		if rf.writableMappingPages < pagesBefore {
			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
		}
	}

	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar hostarch.AddrRange, offset uint64, writable bool) {
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()

	rf.mappings.RemoveMapping(ms, ar, offset, writable)

	if writable {
		pagesBefore := rf.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		rf.writableMappingPages -= uint64(ar.Length() / hostarch.PageSize)

		if rf.writableMappingPages > pagesBefore {
			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
		}
	}
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR hostarch.AddrRange, offset uint64, writable bool) error {
	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at hostarch.AccessType) ([]memmap.Translation, error) {
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	// Constrain translations to f.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := offsetPageEnd(int64(rf.size.RacyLoad()))
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}
	pagesToFill := rf.data.PagesToFill(required, optional)
	if !rf.inode.fs.accountPages(pagesToFill) {
		// If we cannot accommodate pagesToFill pages, retry with just the
		// required range, since optional may be larger than required. Only
		// error out if even the required range cannot be allocated for.
		pagesToFill = rf.data.PagesToFill(required, required)
		if !rf.inode.fs.accountPages(pagesToFill) {
			return nil, &memmap.BusError{linuxerr.ENOSPC}
		}
		optional = required
	}
	pagesAlloced, cerr := rf.data.Fill(ctx, required, optional, rf.size.RacyLoad(), rf.inode.fs.mf, rf.memoryUsageKind, pgalloc.AllocateOnly, nil /* r */)
	// rf.data.Fill() may fail mid-way. We still want to account any pages
	// that were allocated, irrespective of an error.
	rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   rf.inode.fs.mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  hostarch.AnyAccess,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by f.data.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (*regularFile) InvalidateUnsavable(context.Context) error {
	return nil
}

// +stateify savable
type regularFileFD struct {
	fileDescription

	// off is the file offset. off is accessed using atomic memory operations.
	// offMu serializes operations that may mutate off.
	off   int64
	offMu sync.Mutex `state:"nosave"`
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release(context.Context) {
	// noop
}

// Allocate implements vfs.FileDescriptionImpl.Allocate.
func (fd *regularFileFD) Allocate(ctx context.Context, mode, offset, length uint64) error {
	f := fd.inode().impl.(*regularFile)
	// To be consistent with Linux, inode.mu must be locked throughout.
	f.inode.mu.Lock()
	defer f.inode.mu.Unlock()
	end := offset + length
	pgEnd, ok := hostarch.PageRoundUp(end)
	if !ok {
		return linuxerr.EFBIG
	}
	// Allocate in chunks for the following reasons:
	// 1. Size limit may permit really large fallocate, which can take a long
	//    time to execute on the host. This can cause watchdog to timeout and
	//    crash the system. Watchdog needs petting.
	// 2. Linux allocates folios iteratively while checking for interrupts. In
	//    gVisor, we need to manually check for interrupts between chunks.
	const chunkSize = 4 << 30 // 4 GiB
	for curPgStart := hostarch.PageRoundDown(offset); curPgStart < pgEnd; {
		curPgEnd := pgEnd
		newSize := end
		if curPgEnd-curPgStart > chunkSize {
			curPgEnd = curPgStart + chunkSize
			newSize = curPgEnd
		}
		required := memmap.MappableRange{Start: curPgStart, End: curPgEnd}
		if err := f.allocateLocked(ctx, mode, newSize, required); err != nil {
			return err
		}
		// This loop can take a long time to process, so periodically check
		// for interrupts. This also pets the watchdog.
		if ctx.Interrupted() {
			return linuxerr.EINTR
		}
		// Advance curPgStart.
		curPgStart = curPgEnd
	}
	return nil
}

// Preconditions:
//   - rf.inode.mu is locked.
//   - required must be page-aligned.
//   - required.Start < newSize <= required.End.
func (rf *regularFile) allocateLocked(ctx context.Context, mode, newSize uint64, required memmap.MappableRange) error {
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	// We must allocate pages in the range specified by offset and length.
	// Even if newSize <= oldSize, there might not be actual memory backing
	// this range, so any gaps must be filled by calling f.data.Fill().
	// "After a successful call, subsequent writes into the range
	// specified by offset and len are guaranteed not to fail because of
	// lack of disk space." - fallocate(2)
	pagesToFill := rf.data.PagesToFill(required, required)
	if !rf.inode.fs.accountPages(pagesToFill) {
		return linuxerr.ENOSPC
	}
	// Given our definitions in pgalloc, fallocate(2) semantics imply that
	// pages in the MemoryFile must be committed, in addition to being
	// allocated.
	allocMode := pgalloc.AllocateAndCommit
	if !rf.inode.fs.mf.IsDiskBacked() {
		// Upgrade to AllocateAndWritePopulate for memory(shmem)-backed files.
		// We take a more aggressive approach in populating pages for
		// memory-backed MemoryFiles. shmem pages are subject to swap rather
		// than disk writeback. They are not likely to be swapped before they
		// are written to. Hence it is beneficial to populate (in addition to
		// commit) shmem pages to avoid faulting page-by-page when these pages
		// are written to in the future.
		allocMode = pgalloc.AllocateAndWritePopulate
	}
	pagesAlloced, err := rf.data.Fill(ctx, required, required, newSize, rf.inode.fs.mf, rf.memoryUsageKind, allocMode, nil /* r */)
	// f.data.Fill() may fail mid-way. We still want to account any pages
	// that were allocated, irrespective of an error.
	rf.inode.fs.adjustPageAcct(pagesToFill, pagesAlloced)
	if err != nil && err != io.EOF {
		return err
	}

	oldSize := rf.size.Load()
	if oldSize >= newSize {
		return nil
	}
	return rf.growLocked(newSize)
}

// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	start := fsmetric.StartReadWait()
	defer fsmetric.FinishReadWait(fsmetric.TmpfsReadWait, start)
	fsmetric.TmpfsReads.Increment()

	if offset < 0 {
		return 0, linuxerr.EINVAL
	}

	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
	// all state is in-memory.
	//
	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
		return 0, linuxerr.EOPNOTSUPP
	}

	if dst.NumBytes() == 0 {
		return 0, nil
	}
	f := fd.inode().impl.(*regularFile)
	rw := getRegularFileReadWriter(f, offset, 0)
	n, err := dst.CopyOutFrom(ctx, rw)
	putRegularFileReadWriter(rw)
	fd.inode().touchAtime(fd.vfsfd.Mount())
	return n, err
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	fd.offMu.Lock()
	n, err := fd.PRead(ctx, dst, fd.off, opts)
	fd.off += n
	fd.offMu.Unlock()
	return n, err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	n, _, err := fd.pwrite(ctx, src, offset, opts)
	return n, err
}

// pwrite returns the number of bytes written, the final offset, and an error.
// The final offset should be ignored by PWrite.
func (fd *regularFileFD) pwrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (written, finalOff int64, err error) {
	if offset < 0 {
		return 0, offset, linuxerr.EINVAL
	}

	// Check that flags are supported. RWF_DSYNC/RWF_SYNC can be ignored since
	// all state is in-memory.
	//
	// TODO(gvisor.dev/issue/2601): Support select preadv2 flags.
	if opts.Flags&^(linux.RWF_HIPRI|linux.RWF_DSYNC|linux.RWF_SYNC) != 0 {
		return 0, offset, linuxerr.EOPNOTSUPP
	}

	srclen := src.NumBytes()
	if srclen == 0 {
		return 0, offset, nil
	}
	f := fd.inode().impl.(*regularFile)
	f.inode.mu.Lock()
	defer f.inode.mu.Unlock()
	// If the file is opened with O_APPEND, update offset to file size.
	if fd.vfsfd.StatusFlags()&linux.O_APPEND != 0 {
		// Locking f.inode.mu is sufficient for reading f.size.
		offset = int64(f.size.RacyLoad())
	}
	end := offset + srclen
	if end < offset {
		// Overflow.
		return 0, offset, linuxerr.EINVAL
	}

	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
	if err != nil {
		return 0, offset, err
	}
	src = src.TakeFirst64(srclen)

	// Perform the write.
	rw := getRegularFileReadWriter(f, offset, pgalloc.MemoryCgroupIDFromContext(ctx))
	n, err := src.CopyInTo(ctx, rw)

	f.inode.touchCMtimeLocked()
	for {
		old := f.inode.mode.Load()
		new := vfs.ClearSUIDAndSGID(old)
		if swapped := f.inode.mode.CompareAndSwap(old, new); swapped {
			break
		}
	}
	putRegularFileReadWriter(rw)
	return n, n + offset, err
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	fd.offMu.Lock()
	n, off, err := fd.pwrite(ctx, src, fd.off, opts)
	fd.off = off
	fd.offMu.Unlock()
	return n, err
}

// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.offMu.Lock()
	defer fd.offMu.Unlock()
	switch whence {
	case linux.SEEK_SET:
		// use offset as specified
	case linux.SEEK_CUR:
		offset += fd.off
	case linux.SEEK_END:
		offset += int64(fd.inode().impl.(*regularFile).size.Load())
	default:
		return 0, linuxerr.EINVAL
	}
	if offset < 0 {
		return 0, linuxerr.EINVAL
	}
	fd.off = offset
	return offset, nil
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	file := fd.inode().impl.(*regularFile)
	opts.SentryOwnedContent = true
	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
}

// offsetPageEnd returns the file offset rounded up to the nearest
// page boundary. offsetPageEnd panics if rounding up causes overflow,
// which shouldn't be possible given that offset is an int64.
func offsetPageEnd(offset int64) uint64 {
	end, ok := hostarch.Addr(offset).RoundUp()
	if !ok {
		panic("impossible overflow")
	}
	return uint64(end)
}

// regularFileReadWriter implements safemem.Reader and safemem.Writer.
type regularFileReadWriter struct {
	file *regularFile

	// Offset into the file to read/write at. Note that this may be
	// different from the FD offset if PRead/PWrite is used.
	off uint64

	// memCgID is the memory cgroup ID used for accounting the allocated
	// pages.
	memCgID uint32
}

var regularFileReadWriterPool = sync.Pool{
	New: func() any {
		return &regularFileReadWriter{}
	},
}

func getRegularFileReadWriter(file *regularFile, offset int64, memCgID uint32) *regularFileReadWriter {
	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
	rw.file = file
	rw.off = uint64(offset)
	rw.memCgID = memCgID
	return rw
}

func putRegularFileReadWriter(rw *regularFileReadWriter) {
	rw.file = nil
	regularFileReadWriterPool.Put(rw)
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	rw.file.dataMu.RLock()
	defer rw.file.dataMu.RUnlock()
	size := rw.file.size.RacyLoad()

	// Compute the range to read (limited by file size and overflow-checked).
	if rw.off >= size {
		return 0, io.EOF
	}
	end := size
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	var done uint64
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := rw.file.inode.fs.mf.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), hostarch.Read)
			if err != nil {
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Tmpfs holes are zero-filled.
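			// Note: reading a hole does not allocate backing pages; the
			// destination buffers are zero-filled directly below via
			// safemem.ZeroSeq.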
			gapmr := gap.Range().Intersect(mr)
			dst := dsts.TakeFirst64(gapmr.Length())
			n, err := safemem.ZeroSeq(dst)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
	return done, nil
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.file.inode.mu must be held.
func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	// Hold dataMu so we can modify size.
	rw.file.dataMu.Lock()
	defer rw.file.dataMu.Unlock()

	// Compute the range to write (overflow-checked).
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	// Check if seals prevent either file growth or all writes.
	switch {
	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
		return 0, linuxerr.EPERM
	case end > rw.file.size.RacyLoad() && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
		// When growth is sealed, Linux effectively allows writes which would
		// normally grow the file to partially succeed up to the current EOF,
		// rounded down to the page boundary before the EOF.
		//
		// This happens because writes (and thus the growth check) for tmpfs
		// files proceed page-by-page on Linux, and the final write to the
		// page containing EOF fails, resulting in a partial write up to the
		// start of that page.
		//
		// To emulate this behaviour, artificially truncate the write to the
		// start of the page containing the current EOF.
		//
		// See Linux, mm/filemap.c:generic_perform_write() and
		// mm/shmem.c:shmem_write_begin().
		if pgstart := uint64(hostarch.Addr(rw.file.size.RacyLoad()).RoundDown()); end > pgstart {
			end = pgstart
		}
		if end <= rw.off {
			// Truncation would result in no data being written.
			return 0, linuxerr.EPERM
		}
	}

	// Page-aligned mr for when we need to allocate memory. RoundUp can't
	// overflow since end is an int64.
	pgstartaddr := hostarch.Addr(rw.off).RoundDown()
	pgendaddr, _ := hostarch.Addr(end).RoundUp()
	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}

	var (
		done   uint64
		retErr error
	)
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			n, err := rw.writeToMF(seg.FileRangeOf(seg.Range().Intersect(mr)), srcs)
			done += n
			rw.off += uint64(n)
			srcs = srcs.DropFirst64(n)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Allocate memory for the write.
			gapMR := gap.Range().Intersect(pgMR)
			pagesToFill := gapMR.Length() / hostarch.PageSize
			pagesReserved := rw.file.inode.fs.accountPagesPartial(pagesToFill)
			if pagesReserved == 0 {
				if done == 0 {
					retErr = linuxerr.ENOSPC
					goto exitLoop
				}
				retErr = nil
				goto exitLoop
			}
			gapMR.End = gapMR.Start + (hostarch.PageSize * pagesReserved)
			allocMode := pgalloc.AllocateAndWritePopulate
			if rw.file.inode.fs.mf.IsDiskBacked() {
				// Don't populate pages for disk-backed files. Benchmarking
				// showed that disk-backed pages are likely to be written back
				// to disk before we can write to them. The pages fault again
				// on write anyway. In total, prepopulating disk-backed pages
				// deteriorates performance as it fails to eliminate future
				// page faults and additionally incurs useless disk
				// writebacks.
				allocMode = pgalloc.AllocateOnly
			}
			fr, err := rw.file.inode.fs.mf.Allocate(gapMR.Length(), pgalloc.AllocOpts{
				Kind:    rw.file.memoryUsageKind,
				Mode:    allocMode,
				MemCgID: rw.memCgID,
			})
			if err != nil {
				retErr = err
				rw.file.inode.fs.unaccountPages(pagesReserved)
				goto exitLoop
			}

			// Write to that memory as usual.
			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
		default:
			panic("unreachable")
		}
	}
exitLoop:
	// If the write ends beyond the file's previous size, it causes the
	// file to grow.
	if rw.off > rw.file.size.RacyLoad() {
		rw.file.size.Store(rw.off)
	}

	return done, retErr
}

func (rw *regularFileReadWriter) writeToMF(fr memmap.FileRange, srcs safemem.BlockSeq) (uint64, error) {
	if rw.file.inode.fs.mf.IsDiskBacked() {
		// Disk-backed files are not prepopulated. The safemem.CopySeq()
		// approach used below incurs a lot of page faults without page
		// prepopulation, which causes a lot of context switching. Use the
		// write(2) host syscall instead, which makes one context switch and
		// faults all the pages that are touched during the write.
		return hostfd.Pwritev2(
			int32(rw.file.inode.fs.mf.FD()), // fd
			srcs.TakeFirst64(fr.Length()),   // srcs
			int64(fr.Start),                 // offset
			0,                               // flags
		)
	}
	// Get internal mappings.
	ims, err := rw.file.inode.fs.mf.MapInternal(fr, hostarch.Write)
	if err != nil {
		return 0, err
	}
	// Copy to internal mappings.
	return safemem.CopySeq(ims, srcs)
}

// GetSeals returns the current set of seals on a memfd inode.
func GetSeals(fd *vfs.FileDescription) (uint32, error) {
	f, ok := fd.Impl().(*regularFileFD)
	if !ok {
		return 0, linuxerr.EINVAL
	}
	rf := f.inode().impl.(*regularFile)
	rf.dataMu.RLock()
	defer rf.dataMu.RUnlock()
	return rf.seals, nil
}

// AddSeals adds new file seals to a memfd inode.
func AddSeals(fd *vfs.FileDescription, val uint32) error {
	f, ok := fd.Impl().(*regularFileFD)
	if !ok {
		return linuxerr.EINVAL
	}
	rf := f.inode().impl.(*regularFile)
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	if rf.seals&linux.F_SEAL_SEAL != 0 {
		// Seal applied which prevents addition of any new seals.
		return linuxerr.EPERM
	}

	// F_SEAL_WRITE can only be added if there are no active writable maps.
	if rf.seals&linux.F_SEAL_WRITE == 0 && val&linux.F_SEAL_WRITE != 0 {
		if rf.writableMappingPages > 0 {
			return linuxerr.EBUSY
		}
	}

	// Seals can only be added, never removed.
	rf.seals |= val
	return nil
}
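
// Illustrative only (hypothetical caller, not part of this file): GetSeals
// and AddSeals are the exported hooks for memfd seal handling (fcntl(2)
// F_GET_SEALS/F_ADD_SEALS). A caller holding a *vfs.FileDescription for a
// tmpfs regular file might do roughly:
//
//	seals, err := tmpfs.GetSeals(fd) // EINVAL if fd is not a tmpfs regular file
//	if err == nil && seals&linux.F_SEAL_SEAL == 0 {
//		// Fails with EBUSY for F_SEAL_WRITE while writable mappings exist.
//		err = tmpfs.AddSeals(fd, linux.F_SEAL_GROW|linux.F_SEAL_SHRINK)
//	}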